import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
import sklearn
import seaborn as sns
# Seed NumPy's global random generator for reproducibility. The seed must be
# passed in a call: assigning `np.random.seed = 42` would replace the function
# with an integer and leave the generator unseeded.
np.random.seed(42)
# Load the raw away-match data and strip the first column (the id) together
# with the player and team names, so the saved data sets do not carry them.
df = (
    pd.read_csv('../data/players_away_matches.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    .drop(columns=['playerName', 'team_name'])
)

# 70/30 split with a fixed random_state so the partition is reproducible.
X_mod, X_val = train_test_split(df, test_size=0.3, random_state=42)

# Persist both partitions (validators first, then modellers) without the index.
for part, path in (
    (X_val, '../data/for_validators/soccer_players_validators.csv'),
    (X_mod, '../data/for_modellers/soccer_players_modellers.csv'),
):
    part.to_csv(path, index=False)

# From here on, work only with the modellers' partition as stored on disk.
df = pd.read_csv('../data/for_modellers/soccer_players_modellers.csv')
# Labels of every column that holds at least one missing value.
null_columns = df.columns[df.isna().any(axis=0)]
# Header line followed by the column index on the next line.
print("Null columns:", null_columns, sep="\n")
Null columns:
Index(['aerials_lost', 'aerials_won', 'aerials_won_pct', 'assisted_shots',
'ball_recoveries', 'blocked_passes', 'blocked_shots',
'blocked_shots_saves', 'blocks', 'carries', 'carries_into_final_third',
'carries_into_penalty_area', 'carry_distance',
'carry_progressive_distance', 'clearances', 'corner_kicks',
'corner_kicks_in', 'corner_kicks_out', 'corner_kicks_straight',
'crosses_into_penalty_area', 'dispossessed', 'dribble_tackles',
'dribble_tackles_pct', 'dribbled_past', 'dribbles',
'dribbles_completed', 'dribbles_completed_pct', 'dribbles_vs', 'errors',
'gca', 'minutes', 'miscontrols', 'npxg', 'nutmegs', 'pass_targets',
'passes', 'passes_blocked', 'passes_completed', 'passes_completed_long',
'passes_completed_medium', 'passes_completed_short', 'passes_dead',
'passes_free_kicks', 'passes_ground', 'passes_head', 'passes_high',
'passes_intercepted', 'passes_into_final_third',
'passes_into_penalty_area', 'passes_left_foot', 'passes_live',
'passes_long', 'passes_low', 'passes_medium', 'passes_offsides',
'passes_oob', 'passes_other_body', 'passes_pct', 'passes_pct_long',
'passes_pct_medium', 'passes_pct_short', 'passes_pressure',
'passes_progressive_distance', 'passes_received', 'passes_received_pct',
'passes_right_foot', 'passes_short', 'passes_switches',
'passes_total_distance', 'pens_conceded', 'pens_won',
'players_dribbled_past', 'pressure_regain_pct', 'pressure_regains',
'pressures', 'pressures_att_3rd', 'pressures_def_3rd',
'pressures_mid_3rd', 'progressive_carries', 'progressive_passes',
'progressive_passes_received', 'sca', 'tackles', 'tackles_att_3rd',
'tackles_def_3rd', 'tackles_interceptions', 'tackles_mid_3rd',
'through_balls', 'throw_ins', 'touches', 'touches_att_3rd',
'touches_att_pen_area', 'touches_def_3rd', 'touches_def_pen_area',
'touches_live_ball', 'touches_mid_3rd', 'xa', 'xg',
'championship_name'],
dtype='object')
# Count how many columns contain at least one missing value.
null_cols_count = df.isna().any(axis=0).sum()
print(f"Number of columns with null values: {null_cols_count}")
Number of columns with null values: 99
# Show the 20 columns with the most missing values, largest count first.
print("Null counts for each column:")
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(20)
Null counts for each column:
dribbles_completed_pct 26447 dribble_tackles_pct 23101 aerials_won_pct 15866 passes_pct_long 7057 pressure_regain_pct 4735 passes_pct_medium 2492 passes_pct_short 2240 pens_won 2099 pens_conceded 2099 passes_pct 610 passes_received_pct 516 championship_name 459 progressive_carries 71 carries_into_final_third 71 carries_into_penalty_area 71 throw_ins 50 passes_oob 50 touches_def_pen_area 50 passes_received 50 passes_progressive_distance 50 dtype: int64
from matplotlib.colors import LinearSegmentedColormap
# Pairwise correlations between the numeric columns only.
numeric_df = df.select_dtypes(include=['number'])
corr = numeric_df.corr()

# Keep only the notable coefficients (above 0.5 or below -0.3); every other
# cell becomes NaN and is left blank in the heatmap.
notable = (corr > 0.5) | (corr < -0.3)
fcorr = corr.where(notable)

# Diverging palette: blue for negative, white around zero, red for positive.
cmap = LinearSegmentedColormap.from_list("custom_cmap", ["blue", "white", "red"])

# Plot the correlation matrix
fig, ax = plt.subplots(figsize=(80, 60))
sns.heatmap(
    fcorr,
    annot=True,
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()
Next, we look for columns that can be dropped.
# List the columns whose name contains 'pct'.
df.filter(like='pct').columns
Index(['aerials_won_pct', 'dribble_tackles_pct', 'dribbles_completed_pct',
'passes_pct', 'passes_pct_long', 'passes_pct_medium',
'passes_pct_short', 'passes_received_pct', 'pressure_regain_pct'],
dtype='object')
# Compare the raw aerial-duel counts with the percentage column side by side.
df.loc[:, ['aerials_won', 'aerials_lost', 'aerials_won_pct']]
| aerials_won | aerials_lost | aerials_won_pct | |
|---|---|---|---|
| 0 | 0.0 | 0.0 | NaN |
| 1 | 1.0 | 0.0 | 100.0 |
| 2 | 1.0 | 0.0 | 100.0 |
| 3 | 1.0 | 0.0 | 100.0 |
| 4 | 0.0 | 3.0 | 0.0 |
| ... | ... | ... | ... |
| 53324 | 0.0 | 0.0 | NaN |
| 53325 | 1.0 | 2.0 | 33.3 |
| 53326 | 0.0 | 1.0 | 0.0 |
| 53327 | 0.0 | 0.0 | NaN |
| 53328 | 1.0 | 1.0 | 50.0 |
53329 rows × 3 columns
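First, we drop the columns with 'pct' in their names: each one is a percentage derived from counts that are already present in other columns (for example, aerials_won_pct follows from aerials_won and aerials_lost), and it is NaN whenever both counts are zero.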
# Remove every column whose name contains 'pct' from the working frame.
col_drop = df.filter(like='pct').columns
df.drop(col_drop, axis=1, inplace=True)
Next, we drop columns that are highly correlated with other columns, as well as columns whose information is already captured elsewhere.
# For every numeric column, print the other columns whose absolute
# correlation with it exceeds `threshold`.
threshold = 0.7
numeric_df = df.select_dtypes(include=['number'])
corr = numeric_df.corr()

# Exclude self-correlations by position rather than with `corr != 1.0`:
# the float comparison also hides distinct columns that are perfectly
# correlated (exact duplicates), and it fails to hide a diagonal entry that
# is computed as slightly less than 1.0.
off_diagonal = ~np.eye(len(corr), dtype=bool)
high_correlation_matrix = corr.where((corr.abs() > threshold) & off_diagonal)

for column in high_correlation_matrix.columns:
    # Cells that did not pass the filter are NaN; keep only the real hits.
    highest_correlations = high_correlation_matrix[column].dropna()
    if not highest_correlations.empty:
        print(f"Highest correlations for column '{column}':")
        print(highest_correlations)
        print()
Highest correlations for column 'assisted_shots': sca 0.802385 xa 0.700308 Name: assisted_shots, dtype: float64 Highest correlations for column 'blocked_passes': blocks 0.895996 Name: blocked_passes, dtype: float64 Highest correlations for column 'blocks': blocked_passes 0.895996 Name: blocks, dtype: float64 Highest correlations for column 'carries': carry_distance 0.887823 carry_progressive_distance 0.797673 pass_targets 0.910050 passes 0.928569 passes_completed 0.931741 passes_completed_medium 0.845934 passes_completed_short 0.834066 passes_ground 0.934277 passes_live 0.950182 passes_medium 0.862063 passes_received 0.970124 passes_right_foot 0.718032 passes_short 0.828325 passes_total_distance 0.846310 touches 0.933470 touches_live_ball 0.949898 touches_mid_3rd 0.881764 Name: carries, dtype: float64 Highest correlations for column 'carries_into_final_third': progressive_carries 0.700473 Name: carries_into_final_third, dtype: float64 Highest correlations for column 'carry_distance': carries 0.887823 carry_progressive_distance 0.949062 pass_targets 0.789850 passes 0.821235 passes_completed 0.824671 passes_completed_medium 0.766727 passes_completed_short 0.704101 passes_ground 0.834417 passes_live 0.843563 passes_medium 0.778882 passes_received 0.854270 passes_total_distance 0.765695 progressive_carries 0.750323 touches 0.821000 touches_live_ball 0.838291 touches_mid_3rd 0.784414 Name: carry_distance, dtype: float64 Highest correlations for column 'carry_progressive_distance': carries 0.797673 carry_distance 0.949062 pass_targets 0.706686 passes 0.731715 passes_completed 0.732076 passes_ground 0.741688 passes_live 0.746378 passes_received 0.765064 progressive_carries 0.785896 touches 0.730432 touches_live_ball 0.741195 Name: carry_progressive_distance, dtype: float64 Highest correlations for column 'corner_kicks': corner_kicks_in 0.743814 corner_kicks_out 0.752653 Name: corner_kicks, dtype: float64 Highest correlations for column 'corner_kicks_in': corner_kicks 
0.743814 Name: corner_kicks_in, dtype: float64 Highest correlations for column 'corner_kicks_out': corner_kicks 0.752653 Name: corner_kicks_out, dtype: float64 Highest correlations for column 'dribble_tackles': dribbles_vs 0.700013 Name: dribble_tackles, dtype: float64 Highest correlations for column 'dribbled_past': dribbles_vs 0.862981 Name: dribbled_past, dtype: float64 Highest correlations for column 'dribbles': dribbles_completed 0.871465 players_dribbled_past 0.891752 Name: dribbles, dtype: float64 Highest correlations for column 'dribbles_completed': dribbles 0.871465 players_dribbled_past 0.979029 Name: dribbles_completed, dtype: float64 Highest correlations for column 'dribbles_vs': dribble_tackles 0.700013 dribbled_past 0.862981 Name: dribbles_vs, dtype: float64 Highest correlations for column 'interceptions': tackles_interceptions 0.72691 Name: interceptions, dtype: float64 Highest correlations for column 'minutes': touches 0.734107 touches_live_ball 0.700780 Name: minutes, dtype: float64 Highest correlations for column 'npxg': shots_total 0.710451 xg 0.911297 Name: npxg, dtype: float64 Highest correlations for column 'pass_targets': carries 0.910050 carry_distance 0.789850 carry_progressive_distance 0.706686 passes 0.832753 passes_completed 0.832053 passes_completed_medium 0.729545 passes_completed_short 0.808722 passes_ground 0.846445 passes_live 0.867828 passes_medium 0.755058 passes_received 0.948210 passes_short 0.821342 passes_total_distance 0.714773 touches 0.856298 touches_live_ball 0.885329 touches_mid_3rd 0.833050 Name: pass_targets, dtype: float64 Highest correlations for column 'passes': carries 0.928569 carry_distance 0.821235 carry_progressive_distance 0.731715 pass_targets 0.832753 passes_completed 0.979099 passes_completed_medium 0.901123 passes_completed_short 0.819201 passes_ground 0.934716 passes_live 0.980689 passes_medium 0.921942 passes_progressive_distance 0.731606 passes_received 0.932947 passes_right_foot 0.743326 passes_short 
0.811142 passes_total_distance 0.934619 touches 0.981020 touches_live_ball 0.959676 touches_mid_3rd 0.862307 Name: passes, dtype: float64 Highest correlations for column 'passes_completed': carries 0.931741 carry_distance 0.824671 carry_progressive_distance 0.732076 pass_targets 0.832053 passes 0.979099 passes_completed_medium 0.935288 passes_completed_short 0.829897 passes_ground 0.969800 passes_live 0.980667 passes_medium 0.936646 passes_received 0.941563 passes_right_foot 0.750939 passes_short 0.806748 passes_total_distance 0.946188 touches 0.954476 touches_live_ball 0.951318 touches_mid_3rd 0.872481 Name: passes_completed, dtype: float64 Highest correlations for column 'passes_completed_long': passes_long 0.894553 passes_progressive_distance 0.806589 passes_total_distance 0.852206 touches_def_3rd 0.710982 Name: passes_completed_long, dtype: float64 Highest correlations for column 'passes_completed_medium': carries 0.845934 carry_distance 0.766727 pass_targets 0.729545 passes 0.901123 passes_completed 0.935288 passes_ground 0.919530 passes_live 0.913522 passes_medium 0.986425 passes_received 0.858815 passes_right_foot 0.700535 passes_total_distance 0.919726 touches 0.871046 touches_live_ball 0.877326 touches_mid_3rd 0.796076 Name: passes_completed_medium, dtype: float64 Highest correlations for column 'passes_completed_short': carries 0.834066 carry_distance 0.704101 pass_targets 0.808722 passes 0.819201 passes_completed 0.829897 passes_ground 0.817362 passes_live 0.823629 passes_received 0.842823 passes_short 0.986980 touches 0.820249 touches_live_ball 0.821731 touches_mid_3rd 0.811754 Name: passes_completed_short, dtype: float64 Highest correlations for column 'passes_dead': passes_high 0.712027 Name: passes_dead, dtype: float64 Highest correlations for column 'passes_ground': carries 0.934277 carry_distance 0.834417 carry_progressive_distance 0.741688 pass_targets 0.846445 passes 0.934716 passes_completed 0.969800 passes_completed_medium 0.919530 
passes_completed_short 0.817362 passes_live 0.963378 passes_medium 0.915410 passes_received 0.946617 passes_right_foot 0.746277 passes_short 0.796345 passes_total_distance 0.894980 touches 0.911640 touches_live_ball 0.932541 touches_mid_3rd 0.864728 Name: passes_ground, dtype: float64 Highest correlations for column 'passes_high': passes_dead 0.712027 passes_long 0.816898 passes_progressive_distance 0.741458 Name: passes_high, dtype: float64 Highest correlations for column 'passes_into_final_third': progressive_passes 0.72586 touches_mid_3rd 0.73174 Name: passes_into_final_third, dtype: float64 Highest correlations for column 'passes_live': carries 0.950182 carry_distance 0.843563 carry_progressive_distance 0.746378 pass_targets 0.867828 passes 0.980689 passes_completed 0.980667 passes_completed_medium 0.913522 passes_completed_short 0.823629 passes_ground 0.963378 passes_medium 0.928450 passes_received 0.962516 passes_right_foot 0.757484 passes_short 0.816100 passes_total_distance 0.920206 touches 0.966149 touches_live_ball 0.978919 touches_mid_3rd 0.894138 Name: passes_live, dtype: float64 Highest correlations for column 'passes_long': passes_completed_long 0.894553 passes_high 0.816898 passes_progressive_distance 0.838354 passes_total_distance 0.746593 touches_def_3rd 0.742694 Name: passes_long, dtype: float64 Highest correlations for column 'passes_medium': carries 0.862063 carry_distance 0.778882 pass_targets 0.755058 passes 0.921942 passes_completed 0.936646 passes_completed_medium 0.986425 passes_ground 0.915410 passes_live 0.928450 passes_received 0.873890 passes_total_distance 0.906930 touches 0.899698 touches_live_ball 0.901190 touches_mid_3rd 0.823866 Name: passes_medium, dtype: float64 Highest correlations for column 'passes_other_body': touches_def_pen_area 0.738015 Name: passes_other_body, dtype: float64 Highest correlations for column 'passes_progressive_distance': passes 0.731606 passes_completed_long 0.806589 passes_high 0.741458 passes_long 
0.838354 passes_total_distance 0.827215 touches_def_3rd 0.795014 Name: passes_progressive_distance, dtype: float64 Highest correlations for column 'passes_received': carries 0.970124 carry_distance 0.854270 carry_progressive_distance 0.765064 pass_targets 0.948210 passes 0.932947 passes_completed 0.941563 passes_completed_medium 0.858815 passes_completed_short 0.842823 passes_ground 0.946617 passes_live 0.962516 passes_medium 0.873890 passes_right_foot 0.726258 passes_short 0.838963 passes_total_distance 0.850692 touches 0.933362 touches_live_ball 0.956220 touches_mid_3rd 0.890305 Name: passes_received, dtype: float64 Highest correlations for column 'passes_right_foot': carries 0.718032 passes 0.743326 passes_completed 0.750939 passes_completed_medium 0.700535 passes_ground 0.746277 passes_live 0.757484 passes_received 0.726258 passes_total_distance 0.736669 touches 0.723090 touches_live_ball 0.731926 Name: passes_right_foot, dtype: float64 Highest correlations for column 'passes_short': carries 0.828325 pass_targets 0.821342 passes 0.811142 passes_completed 0.806748 passes_completed_short 0.986980 passes_ground 0.796345 passes_live 0.816100 passes_received 0.838963 touches 0.819769 touches_live_ball 0.822157 touches_mid_3rd 0.814326 Name: passes_short, dtype: float64 Highest correlations for column 'passes_total_distance': carries 0.846310 carry_distance 0.765695 pass_targets 0.714773 passes 0.934619 passes_completed 0.946188 passes_completed_long 0.852206 passes_completed_medium 0.919726 passes_ground 0.894980 passes_live 0.920206 passes_long 0.746593 passes_medium 0.906930 passes_progressive_distance 0.827215 passes_received 0.850692 passes_right_foot 0.736669 touches 0.895703 touches_def_3rd 0.731744 touches_live_ball 0.877680 touches_mid_3rd 0.757201 Name: passes_total_distance, dtype: float64 Highest correlations for column 'pens_att': pens_made 0.898519 Name: pens_att, dtype: float64 Highest correlations for column 'pens_made': pens_att 0.898519 Name: 
pens_made, dtype: float64 Highest correlations for column 'players_dribbled_past': dribbles 0.891752 dribbles_completed 0.979029 Name: players_dribbled_past, dtype: float64 Highest correlations for column 'pressure_regains': pressures 0.787829 Name: pressure_regains, dtype: float64 Highest correlations for column 'pressures': pressure_regains 0.787829 pressures_mid_3rd 0.886289 Name: pressures, dtype: float64 Highest correlations for column 'pressures_mid_3rd': pressures 0.886289 Name: pressures_mid_3rd, dtype: float64 Highest correlations for column 'progressive_carries': carries_into_final_third 0.700473 carry_distance 0.750323 carry_progressive_distance 0.785896 Name: progressive_carries, dtype: float64 Highest correlations for column 'progressive_passes': passes_into_final_third 0.725860 touches_mid_3rd 0.701927 Name: progressive_passes, dtype: float64 Highest correlations for column 'progressive_passes_received': touches_att_pen_area 0.726256 Name: progressive_passes_received, dtype: float64 Highest correlations for column 'sca': assisted_shots 0.802385 Name: sca, dtype: float64 Highest correlations for column 'shots_total': npxg 0.710451 Name: shots_total, dtype: float64 Highest correlations for column 'tackles': tackles_def_3rd 0.790656 tackles_interceptions 0.864330 tackles_won 0.850674 Name: tackles, dtype: float64 Highest correlations for column 'tackles_def_3rd': tackles 0.790656 tackles_interceptions 0.701239 Name: tackles_def_3rd, dtype: float64 Highest correlations for column 'tackles_interceptions': interceptions 0.726910 tackles 0.864330 tackles_def_3rd 0.701239 tackles_won 0.732648 Name: tackles_interceptions, dtype: float64 Highest correlations for column 'tackles_won': tackles 0.850674 tackles_interceptions 0.732648 Name: tackles_won, dtype: float64 Highest correlations for column 'touches': carries 0.933470 carry_distance 0.821000 carry_progressive_distance 0.730432 minutes 0.734107 pass_targets 0.856298 passes 0.981020 passes_completed 0.954476 
passes_completed_medium 0.871046 passes_completed_short 0.820249 passes_ground 0.911640 passes_live 0.966149 passes_medium 0.899698 passes_received 0.933362 passes_right_foot 0.723090 passes_short 0.819769 passes_total_distance 0.895703 touches_live_ball 0.984702 touches_mid_3rd 0.876376 Name: touches, dtype: float64 Highest correlations for column 'touches_att_pen_area': progressive_passes_received 0.726256 Name: touches_att_pen_area, dtype: float64 Highest correlations for column 'touches_def_3rd': passes_completed_long 0.710982 passes_long 0.742694 passes_progressive_distance 0.795014 passes_total_distance 0.731744 touches_def_pen_area 0.726768 Name: touches_def_3rd, dtype: float64 Highest correlations for column 'touches_def_pen_area': passes_other_body 0.738015 touches_def_3rd 0.726768 Name: touches_def_pen_area, dtype: float64 Highest correlations for column 'touches_live_ball': carries 0.949898 carry_distance 0.838291 carry_progressive_distance 0.741195 minutes 0.700780 pass_targets 0.885329 passes 0.959676 passes_completed 0.951318 passes_completed_medium 0.877326 passes_completed_short 0.821731 passes_ground 0.932541 passes_live 0.978919 passes_medium 0.901190 passes_received 0.956220 passes_right_foot 0.731926 passes_short 0.822157 passes_total_distance 0.877680 touches 0.984702 touches_mid_3rd 0.902909 Name: touches_live_ball, dtype: float64 Highest correlations for column 'touches_mid_3rd': carries 0.881764 carry_distance 0.784414 pass_targets 0.833050 passes 0.862307 passes_completed 0.872481 passes_completed_medium 0.796076 passes_completed_short 0.811754 passes_ground 0.864728 passes_into_final_third 0.731740 passes_live 0.894138 passes_medium 0.823866 passes_received 0.890305 passes_short 0.814326 passes_total_distance 0.757201 progressive_passes 0.701927 touches 0.876376 touches_live_ball 0.902909 Name: touches_mid_3rd, dtype: float64 Highest correlations for column 'xa': assisted_shots 0.700308 Name: xa, dtype: float64 Highest correlations for 
column 'xg': npxg 0.911297 Name: xg, dtype: float64
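Column families to prune, with the pattern used to select each one in parentheses: passes, touches, carries (carr), dribbles (dribble), corner kicks (corner), blocks (block).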
# Collect the columns with 'passes' in their name that are strongly
# correlated (|r| > 0.6) with the total `passes` count.
passes_columns = df.filter(regex='passes').columns
# Skip the reference column by name. A positional `pop(0)` only works while
# 'passes' happens to be the first match; otherwise it would discard another
# column and leave 'passes' itself in the list.
arr_passes = [
    col
    for col in passes_columns
    if col != 'passes' and abs(df[col].corr(df['passes'])) > 0.6
]
arr_passes
['passes_completed', 'passes_completed_long', 'passes_completed_medium', 'passes_completed_short', 'passes_ground', 'passes_into_final_third', 'passes_live', 'passes_long', 'passes_low', 'passes_medium', 'passes_progressive_distance', 'passes_received', 'passes_right_foot', 'passes_short', 'passes_total_distance', 'progressive_passes']
# Collect the columns with 'touches' in their name that are strongly
# correlated (|r| > 0.6) with the total `touches` count.
touches_columns = df.filter(regex='touches').columns
# Skip the reference column by name instead of relying on it being the first
# match (`pop(0)`), which would remove the wrong column if the order changed.
arr_touches = [
    col
    for col in touches_columns
    if col != 'touches' and abs(df[col].corr(df['touches'])) > 0.6
]
arr_touches
['touches_def_3rd', 'touches_live_ball', 'touches_mid_3rd']
# Collect the columns with 'carr' in their name that are strongly
# correlated (|r| > 0.6) with the total `carries` count.
carr_columns = df.filter(regex='carr').columns
# Skip the reference column by name instead of relying on it being the first
# match (`pop(0)`), which would remove the wrong column if the order changed.
arr_carr = [
    col
    for col in carr_columns
    if col != 'carries' and abs(df[col].corr(df['carries'])) > 0.6
]
arr_carr
['carry_distance', 'carry_progressive_distance', 'progressive_carries']
# Collect the columns with 'dribble' in their name that are strongly
# correlated (|r| > 0.6) with the `dribbles` count.
dribble_columns = df.filter(regex='dribble').columns
# Skip the reference column by name. 'dribbles' is not the first matching
# column, so a positional `pop(0)` is only right while every earlier match
# stays below the correlation cut-off.
arr_dribble = [
    col
    for col in dribble_columns
    if col != 'dribbles' and abs(df[col].corr(df['dribbles'])) > 0.6
]
arr_dribble
['dribbles_completed', 'players_dribbled_past']
# Collect the columns with 'corner' in their name that are strongly
# correlated (|r| > 0.6) with the total `corner_kicks` count.
corner_columns = df.filter(regex='corner').columns
# Skip the reference column by name instead of relying on it being the first
# match (`pop(0)`), which would remove the wrong column if the order changed.
arr_corner = [
    col
    for col in corner_columns
    if col != 'corner_kicks' and abs(df[col].corr(df['corner_kicks'])) > 0.6
]
arr_corner
['corner_kicks_in', 'corner_kicks_out']
# Collect the columns with 'block' in their name that are strongly
# correlated (|r| > 0.6) with the total `blocks` count.
block_columns = df.filter(regex='block').columns
# Skip the reference column by name. The previous `pop(1)` was a hand-tuned
# position that is only correct while exactly one other column precedes
# 'blocks' among the matches that pass the cut-off.
arr_block = [
    col
    for col in block_columns
    if col != 'blocks' and abs(df[col].corr(df['blocks'])) > 0.6
]
arr_block
['blocked_passes']
# Merge the per-family selections (same order as they were built) and remove
# all of those columns from the working frame in one go.
cols_to_drop = [
    *arr_passes,
    *arr_touches,
    *arr_carr,
    *arr_dribble,
    *arr_corner,
    *arr_block,
]
df.drop(cols_to_drop, axis=1, inplace=True)
# Re-run the high-correlation report on the columns that remain: for every
# numeric column, print the other columns whose absolute correlation with it
# exceeds `threshold`.
threshold = 0.7
numeric_df = df.select_dtypes(include=['number'])
corr = numeric_df.corr()

# Exclude self-correlations by position rather than with `corr != 1.0`:
# the float comparison also hides distinct columns that are perfectly
# correlated (exact duplicates), and it fails to hide a diagonal entry that
# is computed as slightly less than 1.0.
off_diagonal = ~np.eye(len(corr), dtype=bool)
high_correlation_matrix = corr.where((corr.abs() > threshold) & off_diagonal)

for column in high_correlation_matrix.columns:
    # Cells that did not pass the filter are NaN; keep only the real hits.
    highest_correlations = high_correlation_matrix[column].dropna()
    if not highest_correlations.empty:
        print(f"Highest correlations for column '{column}':")
        print(highest_correlations)
        print()
Highest correlations for column 'assisted_shots': sca 0.802385 xa 0.700308 Name: assisted_shots, dtype: float64 Highest correlations for column 'carries': pass_targets 0.910050 passes 0.928569 touches 0.933470 Name: carries, dtype: float64 Highest correlations for column 'dribble_tackles': dribbles_vs 0.700013 Name: dribble_tackles, dtype: float64 Highest correlations for column 'dribbled_past': dribbles_vs 0.862981 Name: dribbled_past, dtype: float64 Highest correlations for column 'dribbles_vs': dribble_tackles 0.700013 dribbled_past 0.862981 Name: dribbles_vs, dtype: float64 Highest correlations for column 'interceptions': tackles_interceptions 0.72691 Name: interceptions, dtype: float64 Highest correlations for column 'minutes': touches 0.734107 Name: minutes, dtype: float64 Highest correlations for column 'npxg': shots_total 0.710451 xg 0.911297 Name: npxg, dtype: float64 Highest correlations for column 'pass_targets': carries 0.910050 passes 0.832753 touches 0.856298 Name: pass_targets, dtype: float64 Highest correlations for column 'passes': carries 0.928569 pass_targets 0.832753 touches 0.981020 Name: passes, dtype: float64 Highest correlations for column 'passes_dead': passes_high 0.712027 Name: passes_dead, dtype: float64 Highest correlations for column 'passes_high': passes_dead 0.712027 Name: passes_high, dtype: float64 Highest correlations for column 'passes_other_body': touches_def_pen_area 0.738015 Name: passes_other_body, dtype: float64 Highest correlations for column 'pens_att': pens_made 0.898519 Name: pens_att, dtype: float64 Highest correlations for column 'pens_made': pens_att 0.898519 Name: pens_made, dtype: float64 Highest correlations for column 'pressure_regains': pressures 0.787829 Name: pressure_regains, dtype: float64 Highest correlations for column 'pressures': pressure_regains 0.787829 pressures_mid_3rd 0.886289 Name: pressures, dtype: float64 Highest correlations for column 'pressures_mid_3rd': pressures 0.886289 Name: 
pressures_mid_3rd, dtype: float64 Highest correlations for column 'progressive_passes_received': touches_att_pen_area 0.726256 Name: progressive_passes_received, dtype: float64 Highest correlations for column 'sca': assisted_shots 0.802385 Name: sca, dtype: float64 Highest correlations for column 'shots_total': npxg 0.710451 Name: shots_total, dtype: float64 Highest correlations for column 'tackles': tackles_def_3rd 0.790656 tackles_interceptions 0.864330 tackles_won 0.850674 Name: tackles, dtype: float64 Highest correlations for column 'tackles_def_3rd': tackles 0.790656 tackles_interceptions 0.701239 Name: tackles_def_3rd, dtype: float64 Highest correlations for column 'tackles_interceptions': interceptions 0.726910 tackles 0.864330 tackles_def_3rd 0.701239 tackles_won 0.732648 Name: tackles_interceptions, dtype: float64 Highest correlations for column 'tackles_won': tackles 0.850674 tackles_interceptions 0.732648 Name: tackles_won, dtype: float64 Highest correlations for column 'touches': carries 0.933470 minutes 0.734107 pass_targets 0.856298 passes 0.981020 Name: touches, dtype: float64 Highest correlations for column 'touches_att_pen_area': progressive_passes_received 0.726256 Name: touches_att_pen_area, dtype: float64 Highest correlations for column 'touches_def_pen_area': passes_other_body 0.738015 Name: touches_def_pen_area, dtype: float64 Highest correlations for column 'xa': assisted_shots 0.700308 Name: xa, dtype: float64 Highest correlations for column 'xg': npxg 0.911297 Name: xg, dtype: float64
# Hand-picked columns to drop, one side of each remaining highly correlated
# pair. 'tackles_interceptions' was listed twice; it is now listed once.
cols_to_drop2 = [
    'xg',
    'tackles_def_3rd',
    'tackles_won',
    'tackles_interceptions',
    'npxg',
    'sca',
    'xa',
    'pass_targets',
    'dribble_tackles',
    'dribbled_past',
    'pens_made',
    'pressure_regains',
    'pressures_mid_3rd',
]
df.drop(columns=cols_to_drop2, inplace=True)
# 'touches' and 'carries' both correlate above 0.9 with 'passes' in the report
# above, so they are removed as well.
cols_to_drop3 = ['touches', 'carries']
df.drop(cols_to_drop3, axis=1, inplace=True)
# Re-check missing values on the reduced frame: top 20 columns by null count.
print("Null counts for each column:")
remaining_missing = df.isna().sum()
remaining_missing.sort_values(ascending=False).head(20)
Null counts for each column:
pens_won 2099 pens_conceded 2099 championship_name 459 carries_into_penalty_area 71 carries_into_final_third 71 passes_other_body 50 dribbles 50 dribbles_vs 50 errors 50 gca 50 passes_switches 50 passes_pressure 50 miscontrols 50 crosses_into_penalty_area 50 nutmegs 50 passes_oob 50 passes_offsides 50 passes 50 passes_blocked 50 passes_dead 50 dtype: int64
# Number of remaining columns that still contain at least one missing value.
null_cols_count = df.isna().any(axis=0).sum()
print(f"Number of columns with null values: {null_cols_count}")
Number of columns with null values: 50
# Inspect the rows in which the 'passes' value is missing.
df.loc[df['passes'].isnull()]
| awayScore | awayTeamName | awayTeamTacticalSchema | homeScore | homeTeamName | homeTeamTacticalSchema | matchDate | matchWeek | aerials_lost | aerials_won | ... | shots_total | tackles | tackles_att_3rd | tackles_mid_3rd | through_balls | throw_ins | touches_att_3rd | touches_att_pen_area | touches_def_pen_area | championship_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 61 | 1 | Valladolid | (4-4-2) | 4 | Real Sociedad | (4-4-2) | 2021-05-16 | 37 | NaN | NaN | ... | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 2312 | 0 | Rayo Vallecano | (4-2-3-1) | 1 | Real Sociedad | (4-1-4-1) | 2021-08-22 | 2 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 2709 | 1 | Spezia | (4-3-3) | 2 | Lazio | (3-5-2) | 2021-04-03 | 29 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 4964 | 2 | Spezia | (4-3-3) | 2 | Parma | (4-4-2◆) | 2020-10-25 | 5 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 5378 | 0 | Valladolid | (4-4-2) | 2 | Atlético Madrid | (3-4-1-2) | 2020-12-05 | 12 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 5732 | 2 | Alavés | (4-3-3) | 0 | Elche | (4-4-2) | 2021-05-11 | 36 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 6520 | 0 | Toulouse | (4-1-4-1) | 1 | Marseille | (4-3-3) | 2020-02-08 | 24 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ligue 1 |
| 7975 | 1 | Villarreal | (4-4-1-1) | 1 | Celta Vigo | (4-4-2◆) | 2021-11-20 | 14 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 9177 | 1 | Rennes | (4-4-2) | 0 | Nîmes | (4-3-3) | 2020-01-15 | 12 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ligue 1 |
| 9558 | 1 | Valladolid | (4-4-2) | 4 | Real Sociedad | (4-4-2) | 2021-05-16 | 37 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 10799 | 2 | Genoa | (3-5-2) | 1 | Milan | (4-2-3-1) | 2020-03-08 | 26 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 11526 | 0 | Getafe | (4-2-2-2) | 0 | Levante | (3-4-1-2) | 2019-02-02 | 22 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 13297 | 1 | Marseille | (3-4-3) | 1 | Nice | (4-4-2) | 2021-10-27 | 3 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ligue 1 |
| 13325 | 1 | Marseille | (3-4-3) | 1 | Nice | (4-4-2) | 2021-10-27 | 3 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ligue 1 |
| 13959 | 2 | Napoli | (4-2-3-1) | 0 | Fiorentina | (3-5-2) | 2021-05-16 | 37 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 15550 | 1 | Valladolid | (4-4-2) | 1 | Sevilla | (4-3-3) | 2020-12-19 | 14 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 15605 | 2 | Celta Vigo | (4-3-3) | 3 | Leganés | (3-4-3) | 2019-12-08 | 16 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 16643 | 1 | Leicester City | (3-4-3) | 2 | Chelsea | (3-4-3) | 2021-05-18 | 37 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | premier league |
| 16746 | 1 | Rayo Vallecano | (4-2-3-1) | 1 | Levante | (4-3-3) | 2021-09-11 | 4 | NaN | NaN | ... | 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 19130 | 0 | Levante | (3-5-2) | 2 | Celta Vigo | (4-4-2◆) | 2021-04-30 | 34 | NaN | NaN | ... | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 22168 | 1 | Stuttgart | (3-5-2) | 1 | Mönchengladbach | (3-4-3) | 2021-10-16 | 8 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | bundesliga |
| 23217 | 1 | Nürnberg | (4-2-2-2) | 2 | Mainz 05 | (4-4-2◆) | 2019-01-26 | 19 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | bundesliga |
| 24577 | 2 | Spezia | (4-4-2) | 2 | Sampdoria | (4-4-2) | 2021-05-12 | 36 | NaN | NaN | ... | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 26189 | 1 | Internazionale | (4-2-3-1) | 2 | Cagliari | (4-4-2◆) | 2019-03-01 | 26 | NaN | NaN | ... | 10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 27603 | 0 | Real Betis | (4-1-4-1) | 1 | Getafe | (4-4-2) | 2020-01-26 | 21 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 28068 | 1 | Valladolid | (4-4-2) | 1 | Elche | (4-4-2) | 2021-04-21 | 31 | NaN | NaN | ... | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 28831 | 1 | Spezia | (4-3-3) | 4 | Crotone | (3-5-2) | 2020-12-12 | 11 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 30267 | 1 | Marseille | (3-4-3) | 1 | Nice | (4-4-2) | 2021-10-27 | 3 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ligue 1 |
| 30975 | 0 | Osasuna | (4-5-1) | 2 | Real Madrid | (4-2-3-1) | 2021-05-01 | 34 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 31110 | 2 | Levante | (4-4-2) | 2 | Alavés | (4-2-3-1) | 2021-05-08 | 35 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 33383 | 0 | Espanyol | (4-1-4-1) | 1 | Rayo Vallecano | (4-2-3-1) | 2021-12-05 | 16 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 35395 | 1 | Marseille | (3-4-3) | 1 | Nice | (4-4-2) | 2021-10-27 | 3 | NaN | NaN | ... | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ligue 1 |
| 35886 | 3 | Internazionale | (3-5-2) | 1 | Napoli | (4-3-3) | 2020-01-06 | 18 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 35907 | 2 | Spezia | (4-3-3) | 2 | Cagliari | (4-2-3-1) | 2020-11-29 | 9 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 36248 | 1 | Spezia | (3-4-3) | 6 | Lazio | (4-3-3) | 2021-08-28 | 2 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 37849 | 2 | Rayo Vallecano | (4-2-3-1) | 1 | Athletic Club | (4-4-2) | 2021-09-21 | 6 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 39549 | 0 | Real Betis | (4-2-3-1) | 1 | Atlético Madrid | (4-4-2) | 2020-07-11 | 36 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 40371 | 1 | Valladolid | (4-4-2) | 1 | Eibar | (4-4-2) | 2021-02-13 | 23 | NaN | NaN | ... | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 43149 | 0 | Internazionale | (3-5-2) | 2 | Juventus | (4-3-3) | 2020-03-08 | 26 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 43296 | 0 | Celta Vigo | (4-3-3) | 0 | Granada | (3-4-3) | 2020-02-29 | 26 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 43672 | 3 | Eintracht Frankfurt | (3-4-1-2) | 0 | Hannover 96 | (3-4-1-2) | 2019-02-24 | 23 | NaN | NaN | ... | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | bundesliga |
| 43698 | 2 | Milan | (4-3-3) | 1 | Genoa | (3-4-1-2) | 2019-10-05 | 7 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 44826 | 0 | Espanyol | (3-5-2) | 1 | Barcelona | (4-4-2◆) | 2020-07-08 | 35 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 45483 | 1 | Real Betis | (4-1-4-1) | 1 | Eibar | (4-2-3-1) | 2020-02-02 | 22 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 45986 | 2 | Atlético Madrid | (3-5-2) | 1 | Valladolid | (4-4-2) | 2021-05-22 | 38 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 46276 | 2 | Mallorca | (4-1-4-1) | 4 | Getafe | (4-4-2) | 2019-09-22 | 5 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 47642 | 1 | Levante | (4-4-2) | 2 | Getafe | (4-4-2) | 2021-05-16 | 37 | NaN | NaN | ... | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 49736 | 1 | Empoli | (3-5-2) | 2 | Internazionale | (4-2-3-1) | 2019-05-26 | 38 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | seria a |
| 51827 | 1 | Barcelona | (3-4-3) | 2 | Real Madrid | (4-1-4-1) | 2021-04-10 | 30 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
| 51934 | 2 | Levante | (4-4-2) | 4 | Valencia | (4-4-2) | 2020-09-13 | 1 | NaN | NaN | ... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | liga |
50 rows × 73 columns
It turns out that all 50 rows that contain missing values are matches for which no statistics were recorded.
# Discard the rows that have no 'passes' value, then recount how many
# columns still contain at least one missing value.
df = df.dropna(subset=['passes'])
columns_with_nulls = df.isna().any()
null_cols_count = columns_with_nulls.sum()
print("Number of columns with null values:", null_cols_count)
Number of columns with null values: 6
# Missing-value count for every column, largest first (shown as the cell result).
print("Null counts for each column:")
nulls_per_column = df.isna().sum()
nulls_per_column.sort_values(ascending=False)
Null counts for each column:
pens_won 2053
pens_conceded 2053
championship_name 459
carries_into_final_third 21
carries_into_penalty_area 21
...
goals 0
interceptions 0
awayTeamName 0
miscontrols 0
awayScore 0
Length: 73, dtype: int64
# Default value used for each column that still contains NaNs.
fill_defaults = {
    'pens_won': 0,
    'pens_conceded': 0,
    'minutes': 90,
    'carries_into_final_third': 0,
    'carries_into_penalty_area': 0,
    'championship_name': 'unknown',
}
# Fill on the frame itself: `df[col].fillna(..., inplace=True)` operates on a
# temporary Series (chained assignment), which warns on recent pandas and
# leaves df unchanged once Copy-on-Write is active.
df = df.fillna(value=fill_defaults)
null_cols_count = df.isnull().any().sum()
print("Number of columns with null values:", null_cols_count)
Number of columns with null values: 0
0 NULLs left - 1.2 done:)
From the EDA we know that some categorical columns contain many different, unstructured values, so we need to decide how to handle them.
# Collect the names of all object-dtype (string-like) columns and report them.
object_frame = df.select_dtypes(include=['object'])
cat_cols = object_frame.columns
print(f'categorical columns: {cat_cols}')
print(f'number of categorical columns: {len(cat_cols)}')
categorical columns: Index(['awayTeamName', 'awayTeamTacticalSchema', 'homeTeamName',
'homeTeamTacticalSchema', 'matchDate', 'championship_name'],
dtype='object')
number of categorical columns: 6
# Number of distinct (non-null) values in each categorical column.
categorical_part = df.loc[:, cat_cols]
unique_values_count = categorical_part.nunique()
print(unique_values_count)
awayTeamName 129 awayTeamTacticalSchema 32 homeTeamName 122 homeTeamTacticalSchema 32 matchDate 565 championship_name 6 dtype: int64
# List every distinct away-team tactical schema, in order of first appearance.
away_schema = df['awayTeamTacticalSchema']
unique_values = pd.unique(away_schema)
print(unique_values)
['(4-4-2)' '(4-2-3-1)' '(3-5-2)' '(4-5-1)' '(3-4-1-2)' '(4-1-4-1)' '(4-3-3)' '(3-4-3)' '(5-1-2-2)' '(4-4-2◆)' '(4-4-1-1)' '(4-2-2-2)' '(3-5-1-1)' '(3-2-2-2-1)' '(3-2-3-1-1)' '(4-1-3-2)' '(3-2-2-1-2)' '(5-3-2)' '(5-4-1)' '(3-2-3-2)' '(4-2-3-1◆)' '(4-3-2-1)' '(3-1-4-1-1)' '(4-2-2-1-1)' '(4-3-1-2)' '(3-1-4-2)' '(3-4-3◆)' '(3-3-2-2)' '(3-2-4-1)' '(3-3-2-1-1)' 'United' 'Marseille']
We have some incorrect values here ('United' and 'Marseille' are team names, not formations); let's change or remove them.
# Show the rows whose away schema is the bogus value 'United'.
df.loc[df['awayTeamTacticalSchema'].eq('United')]
| awayScore | awayTeamName | awayTeamTacticalSchema | homeScore | homeTeamName | homeTeamTacticalSchema | matchDate | matchWeek | aerials_lost | aerials_won | ... | shots_total | tackles | tackles_att_3rd | tackles_mid_3rd | through_balls | throw_ins | touches_att_3rd | touches_att_pen_area | touches_def_pen_area | championship_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4286 | 1 | Leeds United | United | 1 | Burnley | (4-4-2) | 2021-08-29 | 3 | 5.0 | 4.0 | ... | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 | premier league |
| 22856 | 1 | Leeds United | United | 1 | Burnley | (4-4-2) | 2021-08-29 | 3 | 3.0 | 1.0 | ... | 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 14.0 | 10.0 | 1.0 | premier league |
| 47938 | 1 | Leeds United | United | 1 | Burnley | (4-4-2) | 2021-08-29 | 3 | 4.0 | 2.0 | ... | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 19.0 | 1.0 | 2.0 | premier league |
| 48081 | 1 | Leeds United | United | 1 | Burnley | (4-4-2) | 2021-08-29 | 3 | 0.0 | 0.0 | ... | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 44.0 | premier league |
| 48988 | 1 | Leeds United | United | 1 | Burnley | (4-4-2) | 2021-08-29 | 3 | 0.0 | 0.0 | ... | 2 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 7.0 | 2.0 | 0.0 | premier league |
| 53264 | 1 | Leeds United | United | 1 | Burnley | (4-4-2) | 2021-08-29 | 3 | 3.0 | 5.0 | ... | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 2.0 | 6.0 | premier league |
6 rows × 73 columns
# Show the rows whose away schema is the bogus value 'Marseille'.
df.loc[df['awayTeamTacticalSchema'].eq('Marseille')]
| awayScore | awayTeamName | awayTeamTacticalSchema | homeScore | homeTeamName | homeTeamTacticalSchema | matchDate | matchWeek | aerials_lost | aerials_won | ... | shots_total | tackles | tackles_att_3rd | tackles_mid_3rd | through_balls | throw_ins | touches_att_3rd | touches_att_pen_area | touches_def_pen_area | championship_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14150 | 0 | Marseille | Marseille | 2 | Lille | (4-4-2) | 2021-10-03 | 9 | 0.0 | 2.0 | ... | 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 | 5.0 | 1.0 | ligue 1 |
| 15110 | 0 | Marseille | Marseille | 2 | Lille | (4-4-2) | 2021-10-03 | 9 | 0.0 | 0.0 | ... | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 29.0 | 1.0 | 0.0 | ligue 1 |
| 18643 | 0 | Marseille | Marseille | 2 | Lille | (4-4-2) | 2021-10-03 | 9 | 2.0 | 0.0 | ... | 0 | 1.0 | 1.0 | 0.0 | 0.0 | 8.0 | 18.0 | 1.0 | 7.0 | ligue 1 |
| 19562 | 0 | Marseille | Marseille | 2 | Lille | (4-4-2) | 2021-10-03 | 9 | 0.0 | 0.0 | ... | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 14.0 | 1.0 | 0.0 | ligue 1 |
| 24063 | 0 | Marseille | Marseille | 2 | Lille | (4-4-2) | 2021-10-03 | 9 | 0.0 | 0.0 | ... | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.0 | ligue 1 |
| 25665 | 0 | Marseille | Marseille | 2 | Lille | (4-4-2) | 2021-10-03 | 9 | 1.0 | 0.0 | ... | 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 3.0 | 0.0 | ligue 1 |
| 27168 | 0 | Marseille | Marseille | 2 | Lille | (4-4-2) | 2021-10-03 | 9 | 1.0 | 2.0 | ... | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 3.0 | 13.0 | 1.0 | 0.0 | ligue 1 |
| 33450 | 0 | Marseille | Marseille | 2 | Lille | (4-4-2) | 2021-10-03 | 9 | 0.0 | 0.0 | ... | 0 | 1.0 | 0.0 | 1.0 | 0.0 | 6.0 | 26.0 | 2.0 | 0.0 | ligue 1 |
| 41038 | 0 | Marseille | Marseille | 2 | Lille | (4-4-2) | 2021-10-03 | 9 | 0.0 | 1.0 | ... | 0 | 6.0 | 0.0 | 1.0 | 0.0 | 0.0 | 9.0 | 0.0 | 9.0 | ligue 1 |
9 rows × 73 columns
# Drop the rows where a team name ended up in the tactical-schema column.
team_names_as_schema = ['United', 'Marseille']
keep_row = ~df['awayTeamTacticalSchema'].isin(team_names_as_schema)
df = df[keep_row]
unique_values = df['awayTeamTacticalSchema'].unique()
print(unique_values)
['(4-4-2)' '(4-2-3-1)' '(3-5-2)' '(4-5-1)' '(3-4-1-2)' '(4-1-4-1)' '(4-3-3)' '(3-4-3)' '(5-1-2-2)' '(4-4-2◆)' '(4-4-1-1)' '(4-2-2-2)' '(3-5-1-1)' '(3-2-2-2-1)' '(3-2-3-1-1)' '(4-1-3-2)' '(3-2-2-1-2)' '(5-3-2)' '(5-4-1)' '(3-2-3-2)' '(4-2-3-1◆)' '(4-3-2-1)' '(3-1-4-1-1)' '(4-2-2-1-1)' '(4-3-1-2)' '(3-1-4-2)' '(3-4-3◆)' '(3-3-2-2)' '(3-2-4-1)' '(3-3-2-1-1)']
# Remove the diamond marker '◆' from every away formation, e.g.
# '(4-4-2◆)' -> '(4-4-2)'. Stripping the character from the whole column
# (instead of listing individual formations) also covers marked variants
# that are not present in the current data.
df['awayTeamTacticalSchema'] = df['awayTeamTacticalSchema'].str.replace('◆', '', regex=False)
unique_values = df['awayTeamTacticalSchema'].unique()
print(unique_values)
['(4-4-2)' '(4-2-3-1)' '(3-5-2)' '(4-5-1)' '(3-4-1-2)' '(4-1-4-1)' '(4-3-3)' '(3-4-3)' '(5-1-2-2)' '(4-4-1-1)' '(4-2-2-2)' '(3-5-1-1)' '(3-2-2-2-1)' '(3-2-3-1-1)' '(4-1-3-2)' '(3-2-2-1-2)' '(5-3-2)' '(5-4-1)' '(3-2-3-2)' '(4-3-2-1)' '(3-1-4-1-1)' '(4-2-2-1-1)' '(4-3-1-2)' '(3-1-4-2)' '(3-3-2-2)' '(3-2-4-1)' '(3-3-2-1-1)']
Now the same for homeTeamTacticalSchema.
# List every distinct home-team tactical schema, in order of first appearance.
home_schema = df['homeTeamTacticalSchema']
unique_values = pd.unique(home_schema)
print(unique_values)
['(4-3-3)' '(4-2-3-1)' '(4-4-2)' '(3-5-2)' '(4-1-4-1)' '(4-4-2◆)' '(3-4-3)' '(5-1-2-2)' '(3-5-1-1)' '(4-4-1-1)' '(3-4-1-2)' '(4-5-1)' '(3-1-4-2)' '(4-2-2-2)' '(5-3-2)' '(4-3-2-1)' '(3-2-4-1)' '(3-2-2-1-2)' '(3-3-2-2)' '(3-2-1-2-2)' '(3-4-3◆)' '(3-2-3-1-1)' '(3-2-2-2-1)' '(4-1-3-2)' '(3-2-3-2)' '(4-3-1-2)' '(5-4-1)' '(3-1-4-1-1)' '(4-2-2-1-1)' '(4-2-3-1◆)' '(3-5-2◆)' '(4-3-3◆)']
# Remove the diamond marker '◆' from every home formation, e.g.
# '(3-5-2◆)' -> '(3-5-2)'. Stripping the character from the whole column
# (instead of listing individual formations) also covers marked variants
# that are not present in the current data.
df['homeTeamTacticalSchema'] = df['homeTeamTacticalSchema'].str.replace('◆', '', regex=False)
unique_values = df['homeTeamTacticalSchema'].unique()
print(unique_values)
['(4-3-3)' '(4-2-3-1)' '(4-4-2)' '(3-5-2)' '(4-1-4-1)' '(3-4-3)' '(5-1-2-2)' '(3-5-1-1)' '(4-4-1-1)' '(3-4-1-2)' '(4-5-1)' '(3-1-4-2)' '(4-2-2-2)' '(5-3-2)' '(4-3-2-1)' '(3-2-4-1)' '(3-2-2-1-2)' '(3-3-2-2)' '(3-2-1-2-2)' '(3-2-3-1-1)' '(3-2-2-2-1)' '(4-1-3-2)' '(3-2-3-2)' '(4-3-1-2)' '(5-4-1)' '(3-1-4-1-1)' '(4-2-2-1-1)']
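We will transform these values into 3 columns: defense_schema (the number of players in the first, defensive line), middle_schema (the total number of players in all lines between the first and the last) and strikers_schema (the number of players in the last, attacking line).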
# Number of distinct home formations remaining after the cleanup.
home_schema_count = df['homeTeamTacticalSchema'].nunique()
print(home_schema_count)
27
def extract_numbers(formation: str) -> list:
    """Parse a formation string such as '(4-2-3-1)' into a list of ints.

    The diamond marker '◆', surrounding whitespace and the enclosing
    parentheses are ignored, so '(4-4-2◆)' parses the same as '(4-4-2)'.
    A result with fewer than three entries is right-padded with zeros.

    Args:
        formation: '-'-separated player counts, optionally wrapped in
            parentheses.

    Returns:
        The player counts in their original order, at least three entries.

    Raises:
        ValueError: if a '-'-separated part is not an integer (for example
            a team name stored in the schema column).
    """
    cleaned = formation.replace('◆', '').strip().strip('()')
    counts = [int(part) for part in cleaned.split('-')]
    # A negative multiplier yields an empty list, so longer formations are untouched.
    counts.extend([0] * (3 - len(counts)))
    return counts
# Parse every home formation string into its list of player counts and
# display the result.
formations_home = list(map(extract_numbers, df['homeTeamTacticalSchema']))
formations_home
[[4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [3, 5, 2], [3, 5, 2], [4, 3, 3], [4, 1, 4, 1], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [4, 3, 3], [3, 4, 3], [4, 3, 3], [4, 3, 3], [4, 4, 2], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 1, 4, 1], [4, 2, 3, 1], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [4, 3, 3], [3, 4, 3], [4, 4, 2], [4, 4, 2], [4, 1, 4, 1], [4, 3, 3], [4, 3, 3], [4, 1, 4, 1], [4, 2, 3, 1], [5, 1, 2, 2], [4, 3, 3], [4, 3, 3], [4, 3, 3], [3, 5, 1, 1], [3, 4, 3], [3, 5, 2], [3, 5, 2], [4, 4, 2], [3, 4, 3], [3, 4, 3], [4, 2, 3, 1], [4, 1, 4, 1], [3, 4, 3], [4, 3, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 1, 1], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [3, 4, 3], [4, 1, 4, 1], [3, 4, 3], [3, 5, 2], [4, 3, 3], [4, 2, 3, 1], [4, 1, 4, 1], [4, 4, 2], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 1, 4, 1], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 4, 1, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 1, 4, 1], [4, 1, 4, 1], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [3, 5, 2], [4, 4, 1, 1], [4, 4, 1, 1], [4, 4, 2], [3, 4, 3], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [3, 4, 3], [4, 4, 1, 1], [4, 4, 2], [3, 5, 2], [4, 3, 3], [4, 1, 4, 1], [4, 4, 2], [4, 2, 3, 1], [4, 1, 4, 1], [4, 4, 2], [3, 4, 1, 2], [3, 4, 3], [4, 4, 1, 1], [3, 4, 3], [4, 3, 3], [3, 4, 3], [4, 3, 3], [3, 4, 1, 2], [4, 4, 1, 1], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [3, 4, 1, 2], [4, 2, 3, 1], [4, 2, 3, 1], [3, 5, 2], [4, 1, 4, 1], [4, 2, 3, 1], [4, 4, 1, 1], [4, 4, 1, 1], [4, 1, 4, 1], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [4, 5, 1], [3, 5, 2], [4, 4, 2], [4, 1, 4, 1], [4, 2, 3, 1], [4, 1, 4, 1], [4, 3, 3], [3, 5, 2], [3, 4, 3], [4, 1, 4, 1], [4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [3, 5, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 3, 3], [4, 3, 3], [3, 4, 3], [4, 3, 3], [4, 4, 2], [4, 4, 2], 
[4, 2, 3, 1], [3, 5, 2], [3, 4, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 5, 2], [4, 3, 3], [4, 4, 2], [4, 5, 1], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 3, 3], [4, 4, 1, 1], [4, 3, 3], [4, 3, 3], [4, 4, 2], [3, 1, 4, 2], [3, 5, 2], [3, 5, 2], [4, 3, 3], [4, 4, 1, 1], [4, 2, 3, 1], [3, 4, 1, 2], [4, 3, 3], [4, 3, 3], [4, 4, 2], [4, 4, 2], [4, 2, 2, 2], [4, 3, 3], [4, 4, 2], [3, 5, 2], [3, 5, 2], [5, 3, 2], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [3, 4, 3], [4, 4, 2], [4, 4, 2], [4, 3, 3], [4, 3, 2, 1], [4, 2, 3, 1], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [3, 4, 3], [3, 4, 1, 2], [3, 4, 3], [3, 4, 3], [3, 5, 2], [3, 4, 3], [3, 4, 3], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [4, 4, 2], [3, 5, 2], [3, 5, 2], [4, 3, 3], [4, 4, 2], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [3, 4, 1, 2], [4, 2, 3, 1], [4, 1, 4, 1], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [4, 4, 2], [3, 5, 2], [4, 3, 3], [4, 3, 3], [4, 2, 3, 1], [3, 4, 3], [3, 5, 2], [4, 2, 3, 1], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [4, 1, 4, 1], [3, 5, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 5, 2], [4, 3, 3], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [3, 5, 2], [4, 2, 3, 1], [4, 1, 4, 1], [4, 4, 2], [4, 4, 2], [4, 4, 2], [4, 4, 2], [4, 4, 2], [3, 4, 3], [4, 2, 3, 1], [4, 1, 4, 1], [4, 3, 3], [4, 2, 3, 1], [3, 1, 4, 2], [4, 4, 1, 1], [4, 3, 3], [4, 5, 1], [3, 5, 2], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [3, 5, 1, 1], [4, 4, 2], [4, 3, 3], [4, 4, 2], [3, 4, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 2, 2], [4, 1, 4, 1], [3, 4, 1, 2], [4, 4, 2], [4, 2, 3, 1], [4, 3, 3], [3, 4, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [4, 3, 3], [4, 4, 2], [3, 2, 4, 1], [4, 4, 2], [4, 3, 3], [3, 5, 2], [3, 5, 2], [4, 3, 3], [4, 2, 2, 2], [4, 2, 3, 1], [4, 1, 4, 1], [4, 3, 3], [4, 1, 4, 1], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 5, 1], [4, 2, 3, 1], 
[3, 5, 2], [4, 2, 3, 1], [3, 4, 1, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 3, 3], [4, 3, 3], [4, 2, 3, 1], [3, 5, 2], [4, 4, 2], [3, 1, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 2, 2, 2], [3, 5, 2], [4, 4, 2], [3, 4, 3], [4, 3, 3], [3, 5, 2], [4, 3, 3], [4, 1, 4, 1], [3, 4, 3], [3, 5, 2], [4, 1, 4, 1], [4, 3, 3], [4, 3, 3], [3, 4, 3], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [3, 4, 3], [3, 5, 2], [4, 3, 3], [3, 5, 2], [4, 3, 3], [3, 4, 3], [4, 4, 1, 1], [5, 3, 2], [3, 4, 3], [4, 2, 3, 1], [4, 2, 3, 1], [3, 4, 1, 2], [3, 4, 3], [4, 3, 3], [3, 4, 3], [3, 4, 3], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [3, 5, 2], [4, 4, 2], [4, 4, 2], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [4, 3, 3], [3, 5, 2], [4, 3, 3], [3, 4, 3], [3, 4, 1, 2], [4, 2, 3, 1], [3, 4, 1, 2], [4, 3, 3], [4, 4, 2], [4, 3, 3], [4, 1, 4, 1], [4, 2, 3, 1], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 4, 2], [3, 2, 2, 1, 2], [4, 3, 3], [3, 4, 3], [4, 2, 3, 1], [3, 5, 2], [4, 3, 3], [3, 4, 3], [4, 2, 3, 1], [3, 5, 2], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [4, 3, 3], [4, 4, 2], [4, 4, 2], [4, 1, 4, 1], [4, 4, 2], [4, 2, 3, 1], [3, 4, 1, 2], [4, 1, 4, 1], [4, 4, 2], [3, 4, 1, 2], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 3, 3], [4, 4, 2], [4, 4, 1, 1], [4, 4, 2], [4, 1, 4, 1], [3, 5, 2], [4, 4, 1, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 5, 2], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [3, 5, 2], [3, 5, 2], [3, 4, 1, 2], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 4, 2], [4, 4, 2], [4, 4, 2], [3, 4, 3], [3, 4, 3], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [3, 4, 1, 2], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [4, 3, 3], [3, 5, 2], [4, 4, 1, 1], [4, 2, 3, 1], [3, 4, 3], [3, 5, 2], [4, 3, 3], [4, 1, 4, 1], [4, 4, 1, 1], [4, 4, 2], [4, 3, 3], [4, 3, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 3, 3], [4, 4, 1, 1], [4, 3, 3], [4, 4, 2], [4, 5, 1], [4, 2, 3, 1], [4, 4, 2], [4, 4, 1, 1], [4, 4, 2], [4, 3, 3], [3, 4, 1, 2], [3, 5, 2], [3, 
4, 3], [4, 3, 3], [4, 4, 2], [3, 4, 3], [4, 3, 3], [4, 2, 3, 1], [4, 1, 4, 1], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [3, 5, 2], [3, 4, 3], [3, 5, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 1, 4, 1], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [4, 3, 3], [4, 2, 3, 1], [3, 5, 2], [4, 3, 3], [4, 4, 2], [3, 4, 3], [4, 3, 3], [3, 4, 1, 2], [4, 4, 1, 1], [3, 4, 3], [4, 3, 3], [3, 4, 3], [4, 3, 3], [3, 4, 3], [4, 3, 3], [3, 4, 3], [4, 4, 2], [4, 4, 2], [3, 3, 2, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [3, 4, 1, 2], [3, 5, 2], [4, 4, 1, 1], [4, 2, 3, 1], [3, 4, 3], [3, 1, 4, 2], [3, 5, 2], [3, 4, 3], [4, 3, 3], [4, 1, 4, 1], [4, 2, 3, 1], [3, 4, 3], [4, 3, 3], [4, 3, 3], [3, 5, 1, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 2, 2, 1, 2], [3, 4, 3], [3, 5, 2], [4, 4, 2], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [3, 4, 3], [4, 3, 3], [3, 2, 1, 2, 2], [4, 4, 2], [4, 4, 1, 1], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [4, 4, 2], [4, 4, 2], [4, 1, 4, 1], [4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [4, 1, 4, 1], [3, 4, 3], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 5, 1], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 3, 3], [4, 4, 2], [4, 4, 2], [4, 4, 2], [4, 1, 4, 1], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [4, 1, 4, 1], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 1, 4, 1], [4, 3, 3], [4, 1, 4, 1], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 3, 3], [4, 4, 2], [4, 1, 4, 1], [3, 4, 3], [4, 2, 3, 1], [3, 4, 3], [4, 3, 3], [3, 5, 2], [4, 3, 3], [4, 3, 3], [4, 3, 3], [3, 4, 3], [4, 3, 3], [4, 3, 3], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [3, 4, 1, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 3, 3], [3, 4, 1, 2], [3, 5, 2], [4, 1, 4, 1], [3, 4, 3], [4, 3, 3], [4, 1, 4, 1], [4, 2, 3, 1], [4, 4, 2], [4, 3, 2, 1], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [3, 4, 1, 2], [3, 
4, 3], [3, 5, 2], [4, 4, 2], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [4, 3, 3], [4, 4, 2], [4, 4, 2], [3, 4, 3], [4, 1, 4, 1], [3, 4, 3], [4, 5, 1], [4, 3, 3], [4, 4, 2], [4, 2, 2, 2], [3, 5, 2], [4, 4, 2], [4, 2, 2, 2], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 4, 3], [4, 3, 3], [4, 3, 3], [3, 4, 3], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [3, 4, 1, 2], [4, 5, 1], [3, 5, 2], [4, 3, 3], [3, 4, 3], [4, 2, 3, 1], [4, 3, 3], [4, 5, 1], [4, 3, 3], [3, 5, 2], [3, 5, 2], [4, 2, 3, 1], [3, 5, 2], [4, 2, 3, 1], [4, 1, 4, 1], [3, 4, 3], [4, 3, 3], [3, 5, 2], [3, 5, 2], [4, 1, 4, 1], [3, 4, 3], [4, 3, 3], [3, 4, 1, 2], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [4, 3, 3], [3, 4, 3], [3, 4, 3], [4, 4, 2], [3, 4, 3], [4, 3, 3], [4, 4, 2], [4, 4, 2], [4, 4, 2], [4, 2, 3, 1], [4, 4, 1, 1], [4, 1, 4, 1], [4, 3, 3], [4, 3, 3], [3, 4, 3], [4, 1, 4, 1], [3, 5, 2], [3, 4, 1, 2], [4, 3, 3], [4, 3, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 1, 1], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [3, 4, 3], [4, 1, 4, 1], [4, 2, 3, 1], [4, 1, 4, 1], [4, 4, 2], [3, 4, 3], [4, 4, 2], [4, 1, 4, 1], [3, 4, 3], [3, 5, 2], [3, 4, 1, 2], [3, 4, 3], [4, 4, 2], [3, 4, 3], [3, 4, 1, 2], [4, 4, 2], [4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [3, 5, 2], [3, 4, 1, 2], [4, 2, 3, 1], [3, 4, 3], [4, 1, 4, 1], [4, 4, 2], [4, 4, 2], [4, 4, 2], [3, 5, 2], [3, 4, 1, 2], [4, 1, 4, 1], [4, 4, 2], [3, 5, 2], [3, 4, 3], [4, 2, 3, 1], [3, 4, 3], [4, 3, 3], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [4, 1, 4, 1], [4, 4, 2], [3, 5, 2], [3, 4, 1, 2], [4, 1, 4, 1], [4, 3, 3], [3, 4, 3], [4, 3, 3], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 1, 1], [4, 2, 3, 1], [4, 3, 3], [3, 4, 3], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [4, 1, 4, 1], [4, 2, 3, 1], [4, 5, 1], [4, 3, 3], [4, 4, 2], [3, 5, 2], [3, 5, 
2], [4, 4, 1, 1], [4, 2, 3, 1], [3, 5, 2], [3, 2, 3, 1, 1], [4, 4, 1, 1], [4, 3, 3], [4, 4, 2], [3, 4, 3], [3, 4, 3], [3, 5, 1, 1], [4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [4, 3, 3], [4, 2, 2, 2], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 1, 4, 1], [4, 4, 2], [4, 3, 3], [3, 4, 3], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [3, 4, 3], [4, 4, 2], [3, 5, 2], [4, 4, 1, 1], [4, 2, 3, 1], [3, 2, 2, 1, 2], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [3, 4, 3], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [3, 4, 3], [4, 2, 3, 1], [4, 3, 2, 1], [4, 3, 3], [4, 2, 3, 1], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [3, 4, 1, 2], [4, 3, 3], [4, 4, 2], [3, 5, 2], [4, 1, 4, 1], [4, 4, 2], [3, 1, 4, 2], [4, 4, 2], [4, 3, 3], [3, 4, 3], [3, 5, 2], [3, 4, 3], [4, 3, 3], [4, 4, 2], [4, 1, 4, 1], [3, 5, 2], [4, 1, 4, 1], [4, 2, 3, 1], [4, 4, 1, 1], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [3, 5, 2], [3, 5, 2], [4, 2, 3, 1], [4, 4, 1, 1], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [4, 1, 4, 1], [4, 3, 3], [3, 4, 3], [4, 2, 3, 1], [3, 4, 1, 2], [4, 2, 3, 1], [3, 4, 3], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [3, 5, 1, 1], [4, 5, 1], [4, 3, 3], [4, 4, 1, 1], [4, 3, 3], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 3, 3], [4, 3, 3], [4, 3, 3], [3, 5, 2], [4, 1, 4, 1], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 2, 2, 2], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [4, 1, 4, 1], [4, 3, 3], [3, 4, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 3, 3], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [3, 4, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 4, 2], [4, 1, 4, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 5, 2], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [4, 1, 4, 1], [3, 4, 3], [4, 2, 2, 2], [4, 4, 2], [3, 4, 3], [3, 5, 2], [3, 5, 2], [4, 2, 3, 1], [3, 5, 2], [3, 4, 1, 2], [3, 5, 1, 1], [4, 1, 4, 1], [3, 4, 1, 2], [4, 3, 3], [3, 4, 3], [3, 5, 2], [3, 4, 1, 2], [4, 1, 4, 1], [4, 2, 3, 1], [3, 5, 2], [3, 1, 4, 2], [4, 2, 3, 1], [4, 4, 2], [4, 
4, 2], [4, 4, 2], [4, 3, 3], [3, 2, 2, 2, 1], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [4, 2, 2, 2], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 1, 4, 1], [3, 5, 2], [4, 4, 2], ...]
# Count the players in the middle lines of a formation.
def sum_middle_values(formation):
    """Return the sum of all entries strictly between the first and the last.

    For [4, 2, 3, 1] this is 2 + 3 = 5; for a three-entry formation it is
    just the middle entry.
    """
    total = 0
    for line_size in formation[1:-1]:
        total += line_size
    return total
# Turn every parsed home formation into three numeric columns: the first line,
# the sum of the inner lines, and the last line of the schema
home_line_extractors = {
    'defense_schema_home': lambda form: form[0],
    'middle_schema_home': sum_middle_values,
    'strikers_schema_home': lambda form: form[-1],
}
for column_name, extract in home_line_extractors.items():
    df[column_name] = [extract(form) for form in formations_home]
# The raw schema string is no longer needed once the numeric columns exist
df.drop('homeTeamTacticalSchema', axis=1, inplace=True)
df.head()
| awayScore | awayTeamName | awayTeamTacticalSchema | homeScore | homeTeamName | matchDate | matchWeek | aerials_lost | aerials_won | age | ... | tackles_mid_3rd | through_balls | throw_ins | touches_att_3rd | touches_att_pen_area | touches_def_pen_area | championship_name | defense_schema_home | middle_schema_home | strikers_schema_home | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | Napoli | (4-4-2) | 1 | Spezia | 2021-05-08 | 35 | 0.0 | 0.0 | 24.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 42.0 | seria a | 4 | 3 | 3 |
| 1 | 4 | Bayern Munich | (4-2-3-1) | 0 | Schalke 04 | 2021-01-24 | 18 | 0.0 | 1.0 | 20.0 | ... | 3.0 | 0.0 | 4.0 | 20.0 | 0.0 | 4.0 | bundesliga | 4 | 5 | 1 |
| 2 | 0 | Osasuna | (3-5-2) | 0 | Levante | 2021-12-05 | 16 | 0.0 | 1.0 | 23.0 | ... | 2.0 | 0.0 | 0.0 | 14.0 | 0.0 | 3.0 | liga | 4 | 3 | 3 |
| 3 | 2 | Crystal Palace | (4-5-1) | 2 | Arsenal | 2019-10-27 | 10 | 0.0 | 1.0 | 28.0 | ... | 1.0 | 0.0 | 0.0 | 9.0 | 1.0 | 6.0 | premier league | 4 | 4 | 2 |
| 4 | 0 | Hertha BSC | (4-2-3-1) | 2 | Union Berlin | 2021-11-20 | 12 | 3.0 | 0.0 | 24.0 | ... | 0.0 | 0.0 | 0.0 | 16.0 | 2.0 | 1.0 | bundesliga | 3 | 5 | 2 |
5 rows × 75 columns
# Number of distinct away-team formations present in the data
away_schema_count = df['awayTeamTacticalSchema'].nunique()
print(away_schema_count)
27
# Parse each away formation string into its list of line sizes
formations_away = list(map(extract_numbers, df['awayTeamTacticalSchema']))
formations_away
[[4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [4, 5, 1], [4, 2, 3, 1], [3, 4, 1, 2], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [4, 1, 4, 1], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 4, 2], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [3, 4, 3], [3, 4, 1, 2], [4, 2, 3, 1], [4, 1, 4, 1], [3, 5, 2], [4, 4, 2], [4, 4, 2], [4, 4, 2], [4, 3, 3], [3, 4, 1, 2], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [4, 3, 3], [3, 4, 3], [4, 2, 3, 1], [3, 4, 1, 2], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [5, 1, 2, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [3, 4, 1, 2], [3, 5, 2], [3, 4, 3], [4, 2, 3, 1], [3, 5, 2], [3, 5, 2], [4, 1, 4, 1], [3, 4, 3], [4, 3, 3], [4, 3, 3], [4, 1, 4, 1], [4, 4, 2], [4, 3, 3], [3, 5, 2], [4, 4, 2], [3, 5, 2], [4, 4, 2], [3, 4, 3], [4, 3, 3], [4, 4, 1, 1], [4, 2, 3, 1], [4, 4, 2], [4, 3, 3], [3, 4, 3], [3, 5, 2], [3, 4, 3], [3, 4, 3], [3, 4, 3], [4, 2, 2, 2], [4, 3, 3], [3, 4, 1, 2], [3, 5, 2], [3, 4, 1, 2], [4, 4, 2], [4, 1, 4, 1], [3, 5, 2], [4, 4, 2], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [3, 4, 3], [4, 3, 3], [3, 5, 2], [3, 4, 3], [3, 4, 3], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [3, 4, 1, 2], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [4, 5, 1], [3, 4, 3], [4, 4, 2], [4, 4, 2], [3, 5, 2], [3, 4, 3], [3, 4, 1, 2], [4, 4, 1, 1], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [3, 4, 1, 2], [4, 4, 2], [3, 4, 3], [3, 4, 1, 2], [3, 4, 3], [3, 5, 1, 1], [3, 4, 3], [3, 4, 3], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [4, 4, 2], [3, 5, 2], [3, 5, 2], [4, 4, 2], [4, 4, 2], [4, 4, 1, 1], [3, 4, 1, 2], [4, 3, 3], [3, 5, 1, 1], [4, 4, 2], [4, 2, 3, 1], [4, 3, 3], [4, 3, 3], [4, 4, 2], [4, 4, 2], [4, 3, 3], [4, 4, 2], [3, 5, 2], [3, 5, 2], [4, 3, 3], [4, 3, 3], [3, 5, 2], [3, 4, 1, 2], [4, 4, 2], [4, 3, 3], [3, 5, 1, 1], [4, 1, 4, 1], [4, 3, 3], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 1, 4, 
1], [3, 5, 2], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [3, 4, 1, 2], [3, 4, 1, 2], [4, 4, 2], [4, 1, 4, 1], [4, 4, 2], [3, 4, 1, 2], [3, 5, 2], [3, 4, 3], [4, 2, 3, 1], [3, 4, 3], [3, 4, 1, 2], [3, 2, 2, 2, 1], [3, 4, 3], [3, 4, 3], [4, 1, 4, 1], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 1, 4, 1], [3, 4, 1, 2], [4, 3, 3], [4, 4, 2], [3, 2, 3, 1, 1], [4, 3, 3], [3, 5, 2], [4, 1, 3, 2], [4, 1, 4, 1], [4, 1, 4, 1], [4, 4, 2], [4, 4, 2], [4, 4, 2], [4, 1, 4, 1], [4, 2, 3, 1], [3, 4, 3], [4, 4, 2], [4, 3, 3], [4, 2, 3, 1], [4, 4, 1, 1], [4, 4, 2], [4, 4, 2], [4, 3, 3], [3, 5, 2], [4, 4, 1, 1], [4, 2, 3, 1], [4, 2, 2, 2], [4, 4, 1, 1], [4, 3, 3], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [4, 4, 2], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 5, 1], [3, 4, 3], [4, 1, 4, 1], [4, 2, 3, 1], [3, 4, 3], [3, 5, 2], [4, 3, 3], [4, 3, 3], [4, 4, 2], [4, 4, 1, 1], [4, 2, 3, 1], [4, 1, 4, 1], [3, 5, 2], [4, 4, 2], [4, 4, 2], [3, 5, 2], [3, 4, 3], [3, 5, 2], [4, 4, 2], [3, 5, 2], [4, 4, 2], [3, 4, 3], [3, 5, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [4, 2, 3, 1], [3, 4, 1, 2], [3, 4, 1, 2], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 3, 3], [4, 1, 4, 1], [4, 3, 3], [4, 3, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [4, 5, 1], [4, 4, 2], [3, 4, 1, 2], [4, 1, 4, 1], [3, 2, 2, 1, 2], [4, 2, 3, 1], [4, 4, 1, 1], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [3, 5, 2], [3, 4, 3], [4, 4, 2], [4, 3, 3], [3, 5, 2], [3, 4, 1, 2], [3, 5, 2], [4, 1, 4, 1], [4, 2, 3, 1], [3, 2, 2, 2, 1], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 3, 3], [4, 2, 3, 1], [3, 5, 2], [4, 1, 4, 1], [3, 4, 3], [3, 5, 2], [3, 4, 1, 2], [4, 2, 3, 1], [4, 4, 2], [3, 4, 3], [5, 3, 2], [4, 3, 3], [4, 4, 2], [4, 3, 3], [4, 4, 2], [4, 3, 3], [3, 4, 3], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [4, 2, 2, 2], [4, 1, 4, 1], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [4, 2, 2, 2], [4, 5, 1], [4, 4, 2], [4, 
4, 2], [4, 1, 4, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [4, 3, 3], [4, 3, 3], [4, 3, 3], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [3, 4, 3], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [4, 1, 4, 1], [4, 4, 2], [3, 4, 3], [4, 3, 3], [4, 4, 2], [4, 2, 3, 1], [4, 1, 4, 1], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [3, 4, 1, 2], [3, 4, 1, 2], [4, 2, 3, 1], [4, 4, 2], [4, 1, 3, 2], [3, 4, 3], [4, 3, 3], [4, 2, 3, 1], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [3, 5, 2], [4, 4, 2], [3, 4, 1, 2], [3, 4, 3], [3, 4, 1, 2], [3, 4, 1, 2], [3, 4, 3], [3, 4, 3], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 4, 2], [3, 4, 1, 2], [4, 1, 4, 1], [4, 4, 2], [4, 4, 2], [4, 3, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 3, 3], [4, 4, 2], [4, 1, 4, 1], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 1, 4, 1], [4, 4, 2], [3, 4, 1, 2], [3, 5, 2], [4, 4, 2], [4, 1, 4, 1], [5, 4, 1], [4, 1, 4, 1], [3, 5, 2], [4, 2, 3, 1], [5, 1, 2, 2], [4, 1, 4, 1], [3, 5, 2], [3, 5, 2], [5, 1, 2, 2], [3, 2, 3, 2], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [4, 3, 3], [4, 4, 2], [3, 4, 3], [3, 4, 1, 2], [3, 4, 1, 2], [4, 3, 3], [4, 4, 2], [4, 4, 2], [4, 3, 3], [4, 1, 4, 1], [4, 4, 1, 1], [4, 4, 2], [4, 3, 3], [4, 4, 2], [4, 1, 4, 1], [4, 3, 3], [4, 3, 3], [4, 2, 3, 1], [3, 4, 3], [4, 3, 3], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [3, 4, 3], [4, 3, 3], [4, 1, 4, 1], [4, 3, 3], [3, 5, 2], [4, 3, 3], [4, 2, 3, 1], [4, 1, 4, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 5, 2], [4, 4, 2], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [4, 1, 4, 1], [4, 3, 3], [4, 4, 2], [3, 4, 3], [4, 3, 3], [4, 3, 3], [3, 4, 1, 2], [3, 5, 2], [3, 5, 2], [4, 4, 2], [3, 4, 3], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [5, 3, 2], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [3, 5, 2], [4, 4, 2], [4, 3, 3], [4, 1, 4, 1], [3, 5, 2], [4, 4, 2], [3, 4, 1, 2], [4, 3, 3], [4, 3, 3], [4, 4, 2], [4, 4, 2], [3, 
4, 1, 2], [3, 4, 1, 2], [3, 5, 2], [4, 4, 2], [3, 4, 3], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [4, 4, 2], [3, 5, 2], [4, 5, 1], [3, 5, 2], [4, 4, 2], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [4, 4, 1, 1], [4, 4, 1, 1], [3, 5, 2], [4, 2, 3, 1], [4, 3, 2, 1], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [3, 5, 2], [4, 4, 2], [4, 2, 2, 2], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [4, 1, 4, 1], [4, 2, 3, 1], [3, 5, 2], [3, 4, 3], [4, 2, 3, 1], [4, 1, 4, 1], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [4, 1, 4, 1], [3, 5, 2], [4, 4, 2], [3, 4, 3], [4, 4, 2], [3, 5, 2], [4, 1, 4, 1], [3, 4, 3], [4, 1, 4, 1], [4, 4, 2], [4, 3, 3], [4, 3, 3], [3, 4, 1, 2], [4, 2, 3, 1], [3, 4, 3], [4, 4, 2], [3, 5, 2], [4, 1, 4, 1], [4, 4, 1, 1], [4, 4, 2], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 1, 4, 1], [4, 3, 3], [4, 4, 2], [3, 4, 3], [4, 2, 3, 1], [4, 1, 4, 1], [4, 3, 3], [4, 2, 3, 1], [3, 4, 3], [3, 4, 3], [4, 2, 3, 1], [4, 4, 1, 1], [3, 5, 2], [4, 4, 2], [3, 4, 3], [3, 4, 3], [4, 2, 3, 1], [3, 5, 2], [3, 2, 3, 1, 1], [4, 4, 2], [4, 1, 4, 1], [4, 4, 2], [3, 4, 3], [3, 4, 3], [4, 4, 2], [4, 1, 4, 1], [3, 1, 4, 1, 1], [3, 5, 2], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 1, 4, 1], [3, 4, 3], [3, 5, 2], [4, 4, 2], [4, 3, 3], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [4, 4, 1, 1], [3, 4, 3], [4, 2, 3, 1], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 4, 2], [4, 4, 2], [3, 5, 2], [4, 1, 4, 1], [3, 4, 1, 2], [4, 1, 4, 1], [4, 1, 4, 1], [4, 3, 3], [4, 3, 3], [3, 4, 3], [3, 5, 2], [3, 4, 1, 2], [3, 4, 3], [3, 4, 3], [3, 2, 2, 1, 2], [4, 3, 3], [4, 1, 4, 1], [4, 4, 2], [4, 3, 3], [4, 1, 4, 1], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [3, 4, 3], [4, 4, 2], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [4, 3, 2, 1], [4, 4, 2], [3, 5, 
2], [3, 4, 3], [3, 4, 1, 2], [4, 4, 2], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [3, 4, 3], [3, 4, 3], [4, 1, 4, 1], [4, 4, 2], [4, 1, 4, 1], [4, 1, 4, 1], [4, 3, 3], [4, 4, 2], [4, 3, 3], [4, 3, 3], [4, 2, 2, 2], [4, 3, 3], [4, 4, 2], [4, 4, 2], [4, 1, 4, 1], [3, 4, 1, 2], [4, 1, 4, 1], [3, 5, 2], [3, 5, 2], [3, 4, 3], [4, 1, 4, 1], [4, 3, 3], [3, 4, 1, 2], [3, 4, 3], [4, 4, 2], [3, 4, 1, 2], [3, 5, 2], [4, 2, 3, 1], [3, 4, 3], [3, 5, 2], [3, 5, 2], [4, 3, 3], [4, 2, 3, 1], [4, 3, 3], [3, 5, 2], [4, 4, 2], [4, 3, 2, 1], [3, 4, 3], [4, 4, 2], [4, 3, 3], [4, 2, 3, 1], [3, 4, 3], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [4, 4, 2], [3, 4, 3], [3, 4, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [4, 3, 3], [3, 4, 3], [4, 2, 3, 1], [4, 4, 2], [4, 3, 3], [3, 5, 2], [4, 4, 2], [4, 4, 2], [3, 4, 3], [3, 4, 1, 2], [4, 2, 3, 1], [3, 2, 3, 1, 1], [4, 3, 3], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [3, 4, 1, 2], [4, 4, 2], [3, 5, 2], [4, 2, 3, 1], [3, 5, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 1, 4, 1], [4, 2, 3, 1], [4, 2, 3, 1], [3, 4, 3], [4, 1, 4, 1], [4, 4, 2], [4, 4, 2], [4, 2, 3, 1], [3, 4, 3], [3, 4, 3], [4, 4, 2], [3, 5, 2], [3, 4, 3], [3, 4, 3], [4, 2, 3, 1], [4, 2, 3, 1], [3, 5, 2], [3, 5, 2], [4, 2, 3, 1], [3, 5, 2], [4, 1, 4, 1], [3, 4, 3], [3, 4, 1, 2], [4, 3, 3], [4, 4, 2], [4, 4, 2], [3, 5, 1, 1], [4, 2, 3, 1], [3, 5, 2], [3, 4, 3], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [3, 5, 2], [4, 4, 2], [3, 5, 1, 1], [4, 1, 4, 1], [5, 3, 2], [4, 4, 1, 1], [4, 4, 2], [4, 4, 1, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [4, 2, 3, 1], [4, 1, 4, 1], [4, 1, 4, 1], [4, 3, 3], [4, 4, 2], [3, 5, 2], [3, 4, 3], [4, 1, 4, 1], [4, 4, 2], [4, 3, 3], [4, 3, 3], [4, 1, 4, 1], [4, 4, 1, 1], [3, 4, 3], [4, 3, 3], [4, 1, 4, 1], [3, 4, 3], [3, 4, 3], [4, 4, 2], [4, 5, 1], [3, 5, 2], [3, 5, 2], [4, 4, 2], [4, 3, 3], [3, 4, 3], [3, 5, 2], [4, 1, 4, 1], [4, 2, 3, 1], [4, 2, 3, 1], [4, 5, 1], [5, 3, 2], [4, 2, 3, 1], [4, 3, 3], [4, 2, 
3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [3, 4, 3], [4, 2, 3, 1], [4, 2, 2, 1, 1], [4, 5, 1], [4, 4, 2], [3, 4, 1, 2], [3, 4, 3], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [3, 4, 3], [4, 4, 2], [4, 1, 4, 1], [4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [4, 2, 3, 1], [4, 4, 2], [3, 5, 2], [4, 1, 4, 1], [4, 3, 1, 2], [4, 1, 4, 1], [3, 4, 3], [4, 4, 2], [4, 3, 3], [4, 3, 3], [4, 4, 2], [4, 2, 3, 1], [4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [4, 1, 4, 1], [3, 5, 2], [4, 4, 2], [3, 5, 2], [4, 2, 2, 2], [3, 5, 2], [4, 3, 3], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [3, 4, 3], [3, 5, 2], [3, 5, 2], [3, 4, 3], [4, 4, 2], [4, 3, 3], [3, 5, 2], [4, 2, 3, 1], [4, 3, 3], [4, 2, 3, 1], [4, 2, 3, 1], [3, 4, 3], [4, 1, 4, 1], [4, 4, 2], [4, 4, 2], [3, 4, 3], [3, 4, 3], [3, 5, 1, 1], [4, 2, 3, 1], [4, 4, 2], [3, 4, 3], [4, 3, 3], [3, 5, 2], [3, 5, 2], [4, 4, 2], [4, 3, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 5, 1], [4, 3, 2, 1], [3, 4, 3], [4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [3, 5, 2], [4, 4, 2], [4, 4, 2], [3, 5, 2], [3, 4, 3], [4, 2, 3, 1], [3, 5, 2], [4, 2, 2, 2], [3, 5, 2], [3, 4, 1, 2], [4, 2, 3, 1], [4, 3, 3], [4, 4, 2], [4, 4, 2], [3, 4, 3], [4, 3, 3], [4, 4, 1, 1], [4, 1, 4, 1], [3, 4, 1, 2], [4, 1, 4, 1], [4, 3, 3], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [3, 5, 2], [4, 4, 2], [3, 4, 1, 2], [3, 5, 2], [4, 1, 4, 1], [3, 5, 1, 1], [3, 5, 2], [4, 3, 3], [3, 5, 2], [4, 1, 4, 1], [3, 5, 2], [4, 4, 2], [3, 1, 4, 2], [3, 4, 3], [4, 2, 2, 2], [3, 4, 3], [4, 3, 3], [4, 4, 2], [4, 1, 4, 1], [3, 4, 3], [3, 4, 3], [4, 1, 4, 1], [3, 4, 1, 2], [3, 5, 2], [4, 2, 3, 1], [4, 2, 3, 1], [4, 4, 2], [4, 3, 3], [3, 5, 2], [3, 4, 3], [3, 4, 3], [4, 3, 3], [4, 5, 1], [4, 4, 2], [3, 2, 2, 1, 2], [3, 4, 3], [3, 4, 3], [3, 5, 2], [4, 4, 2], [3, 4, 3], [3, 5, 2], [4, 4, 2], [4, 2, 3, 1], [3, 5, 2], [3, 5, 2], [3, 5, 2], [3, 4, 3], [4, 4, 2], [3, 4, 3], [4, 4, 2], [4, 3, 2, 1], [4, 4, 2], [4, 3, 3], [4, 2, 3, 1], [5, 3, 2], [4, 5, 1], [3, 4, 3], [4, 4, 1, 1], [4, 2, 3, 1], [3, 5, 
2], [4, 2, 2, 2], [4, 2, 3, 1], [3, 4, 3], [3, 5, 2], [3, 2, 2, 2, 1], [4, 4, 2], [4, 2, 3, 1], [4, 1, 4, 1], [4, 2, 3, 1], [4, 4, 2], [4, 4, 2], [4, 3, 3], [4, 3, 3], ...]
# Turn every parsed away formation into three numeric columns: the first line,
# the sum of the inner lines, and the last line of the schema
away_line_extractors = {
    'defense_schema_away': lambda form: form[0],
    'middle_schema_away': sum_middle_values,
    'strikers_schema_away': lambda form: form[-1],
}
for column_name, extract in away_line_extractors.items():
    df[column_name] = [extract(form) for form in formations_away]
# The raw schema string is no longer needed once the numeric columns exist
df.drop('awayTeamTacticalSchema', axis=1, inplace=True)
df.head()
| awayScore | awayTeamName | homeScore | homeTeamName | matchDate | matchWeek | aerials_lost | aerials_won | age | assisted_shots | ... | touches_att_3rd | touches_att_pen_area | touches_def_pen_area | championship_name | defense_schema_home | middle_schema_home | strikers_schema_home | defense_schema_away | middle_schema_away | strikers_schema_away | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | Napoli | 1 | Spezia | 2021-05-08 | 35 | 0.0 | 0.0 | 24.0 | 0.0 | ... | 0.0 | 0.0 | 42.0 | seria a | 4 | 3 | 3 | 4 | 4 | 2 |
| 1 | 4 | Bayern Munich | 0 | Schalke 04 | 2021-01-24 | 18 | 0.0 | 1.0 | 20.0 | 0.0 | ... | 20.0 | 0.0 | 4.0 | bundesliga | 4 | 5 | 1 | 4 | 5 | 1 |
| 2 | 0 | Osasuna | 0 | Levante | 2021-12-05 | 16 | 0.0 | 1.0 | 23.0 | 1.0 | ... | 14.0 | 0.0 | 3.0 | liga | 4 | 3 | 3 | 3 | 5 | 2 |
| 3 | 2 | Crystal Palace | 2 | Arsenal | 2019-10-27 | 10 | 0.0 | 1.0 | 28.0 | 0.0 | ... | 9.0 | 1.0 | 6.0 | premier league | 4 | 4 | 2 | 4 | 5 | 1 |
| 4 | 0 | Hertha BSC | 2 | Union Berlin | 2021-11-20 | 12 | 3.0 | 0.0 | 24.0 | 0.0 | ... | 16.0 | 2.0 | 1.0 | bundesliga | 3 | 5 | 2 | 4 | 5 | 1 |
5 rows × 77 columns
Now let's transform matchDate as in EDA: into year and month.
# Parse the date strings once and derive both calendar features from the
# same datetime Series instead of converting the column twice
match_dates = pd.to_datetime(df['matchDate'])
df['matchYear'] = match_dates.dt.year
df['matchMonth'] = match_dates.dt.month
# The original date column is replaced by the two derived features
df.drop(columns=['matchDate'], inplace=True)
df.head()
| awayScore | awayTeamName | homeScore | homeTeamName | matchWeek | aerials_lost | aerials_won | age | assisted_shots | assists | ... | touches_def_pen_area | championship_name | defense_schema_home | middle_schema_home | strikers_schema_home | defense_schema_away | middle_schema_away | strikers_schema_away | matchYear | matchMonth | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | Napoli | 1 | Spezia | 35 | 0.0 | 0.0 | 24.0 | 0.0 | 0 | ... | 42.0 | seria a | 4 | 3 | 3 | 4 | 4 | 2 | 2021 | 5 |
| 1 | 4 | Bayern Munich | 0 | Schalke 04 | 18 | 0.0 | 1.0 | 20.0 | 0.0 | 0 | ... | 4.0 | bundesliga | 4 | 5 | 1 | 4 | 5 | 1 | 2021 | 1 |
| 2 | 0 | Osasuna | 0 | Levante | 16 | 0.0 | 1.0 | 23.0 | 1.0 | 0 | ... | 3.0 | liga | 4 | 3 | 3 | 3 | 5 | 2 | 2021 | 12 |
| 3 | 2 | Crystal Palace | 2 | Arsenal | 10 | 0.0 | 1.0 | 28.0 | 0.0 | 0 | ... | 6.0 | premier league | 4 | 4 | 2 | 4 | 5 | 1 | 2019 | 10 |
| 4 | 0 | Hertha BSC | 2 | Union Berlin | 12 | 3.0 | 0.0 | 24.0 | 0.0 | 0 | ... | 1.0 | bundesliga | 3 | 5 | 2 | 4 | 5 | 1 | 2021 | 11 |
5 rows × 78 columns
# List the remaining object-dtype columns with their number of distinct values
cat_cols = df.select_dtypes(include='object').columns
unique_values_count = df.loc[:, cat_cols].nunique()
print(unique_values_count)
awayTeamName 129 homeTeamName 122 championship_name 6 dtype: int64
We will remove the team names: they have no natural order, so label encoding is not appropriate, and there are too many distinct values to use a feature hasher without collisions. Besides, the name of a team should not influence our assessment of how someone performed in a match.
# Remove both team-name columns in a single call
df.drop(columns=['awayTeamName', 'homeTeamName'], inplace=True)
For championship name we can use one hot encoding, as there are not so many different values.
from sklearn.preprocessing import OneHotEncoder

# A fresh 0..n-1 index is required so the encoded frame lines up on join
df.reset_index(drop=True, inplace=True)

ohe = OneHotEncoder()
transformed_data = ohe.fit_transform(df[['championship_name']])
# fit_transform returns a sparse matrix; densify it to build a labelled frame
df_encoded = pd.DataFrame(
    transformed_data.toarray(),
    columns=ohe.get_feature_names_out(['championship_name']),
)
# Swap the text column for its indicator columns
df = df.drop(columns=['championship_name']).join(df_encoded)
df.head()
| awayScore | homeScore | matchWeek | aerials_lost | aerials_won | age | assisted_shots | assists | ball_recoveries | blocked_shots | ... | middle_schema_away | strikers_schema_away | matchYear | matchMonth | championship_name_bundesliga | championship_name_liga | championship_name_ligue 1 | championship_name_premier league | championship_name_seria a | championship_name_unknown | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | 1 | 35 | 0.0 | 0.0 | 24.0 | 0.0 | 0 | 1.0 | 0.0 | ... | 4 | 2 | 2021 | 5 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1 | 4 | 0 | 18 | 0.0 | 1.0 | 20.0 | 0.0 | 0 | 11.0 | 0.0 | ... | 5 | 1 | 2021 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0 | 0 | 16 | 0.0 | 1.0 | 23.0 | 1.0 | 0 | 10.0 | 0.0 | ... | 5 | 2 | 2021 | 12 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 2 | 2 | 10 | 0.0 | 1.0 | 28.0 | 0.0 | 0 | 16.0 | 0.0 | ... | 5 | 1 | 2019 | 10 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 4 | 0 | 2 | 12 | 3.0 | 0.0 | 24.0 | 0.0 | 0 | 6.0 | 0.0 | ... | 5 | 1 | 2021 | 11 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 81 columns
Let's check if we didn't create any NULL values by mistake.
# Count missing values per column and show only the columns that have any
null_counts = df.isna().sum()
print("Null counts for each column:")
columns_with_nulls = null_counts.loc[null_counts > 0]
print(columns_with_nulls)
Null counts for each column: Series([], dtype: int64)
# Confirm that no object-dtype (text) columns are left after encoding
object_dtype_frame = df.select_dtypes(include=['object'])
cat_cols = object_dtype_frame.columns
print(f'categorical columns: {cat_cols}')
print(f'number of categorical columns: {len(cat_cols)}')
categorical columns: Index([], dtype='object') number of categorical columns: 0
No categorical columns remain, so step 1.3 is done :)
Automatic outlier detection using knn from pyod
from pyod.models.knn import KNN
# Fit a k-nearest-neighbours outlier detector on every column of df;
# contamination is the expected share of outliers in the data
# NOTE(review): df has not been scaled at this point (MinMaxScaler is only
# applied further down), so the distances are presumably dominated by the
# columns with the largest numeric range - confirm this is intended
clf = KNN(contamination=0.04)
clf.fit(df)
# Store the detector's per-row label; rows labelled 0 are the ones kept below
df['outliers'] = clf.labels_
# Show how many rows received each label
df['outliers'].value_counts()
0 51148 1 2116 Name: outliers, dtype: int64
# Keep only the rows labelled 0 and remove the helper column in one
# expression. Calling drop(..., inplace=True) on a filtered slice can raise
# SettingWithCopyWarning, so the result is built as a new frame instead.
df = df[df['outliers'] == 0].drop(columns=['outliers'])

# Size the subplot grid from the actual number of columns rather than
# hard-coding 21 rows; the height keeps the original 108/21 inches per row
n_plot_cols = 4
n_plot_rows = -(-len(df.columns) // n_plot_cols)  # ceiling division
fig, axs = plt.subplots(
    n_plot_rows,
    n_plot_cols,
    figsize=(24, n_plot_rows * 108 / 21),
)
axs = axs.flatten()
for ax, col in zip(axs, df.columns):
    sns.boxplot(data=df, x=col, ax=ax)
# Hide the leftover axes when the column count does not fill the last row
for unused_ax in axs[len(df.columns):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()
The detector did not remove rare but important observations, such as high goal counts, so it did what we wanted. Step 1.4 is done.
# Persist the engineered features and reload them so the following steps
# start from the file on disk
df.to_csv('../data/for_modellers/after_feature_engineering.csv', index=False)
df = pd.read_csv('../data/for_modellers/after_feature_engineering.csv')

# Map the month onto a 12-step sine wave. The computation is vectorised over
# the whole column instead of calling a Python lambda once per row.
# NOTE(review): a sine alone is not a unique encoding - e.g. months 1 and 5
# (and 2 and 4) get the same value. A matching cosine column would remove the
# ambiguity but adds a feature, so it is left for the authors to decide.
df['matchMonth'] = np.sin(df['matchMonth'] * (2 * np.pi / 12))

# Replace the three listed years with 0, 1, 2
# NOTE(review): any year outside this mapping is left unchanged by replace();
# confirm that the data only contains 2019-2021
df['matchYear'] = df['matchYear'].replace({2019: 0, 2020: 1, 2021: 2})
# Disabled alternative kept as a bare string literal (it is never executed):
# Yeo-Johnson power transform followed by standard scaling. The MinMaxScaler
# code after this string is what actually runs.
'''
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
pt = PowerTransformer(method='yeo-johnson')
scaler = StandardScaler()
df_transformed = pt.fit_transform(df)
df_scaled = scaler.fit_transform(df_transformed)
df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
'''
from sklearn.preprocessing import MinMaxScaler

# Scale every column with MinMaxScaler and keep the original column labels
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Write the scaled data to disk and continue from the saved file
df_scaled.to_csv('../data/for_modellers/after_scaling.csv', index=False)
df = pd.read_csv('../data/for_modellers/after_scaling.csv')
from sklearn.cluster import KMeans
seed=42
def count_wcss_scores(X, k_max):
    """Fit KMeans for k = 1..k_max and collect the within-cluster sum of squares.

    Args:
        X: Data to cluster; passed unchanged to ``KMeans.fit``.
        k_max: Largest number of clusters to try (inclusive).

    Returns:
        A list with one WCSS value per k, ordered from k = 1 to k = k_max.
    """
    scores = []
    for k in range(1, k_max + 1):
        # The module-level ``seed`` keeps the runs reproducible
        kmeans = KMeans(n_clusters=k, random_state=seed)
        kmeans.fit(X)
        # inertia_ is the WCSS of the fitted model on its training data, so
        # there is no need for a second pass over X via -kmeans.score(X)
        scores.append(kmeans.inertia_)
    return scores
# Elbow plot: WCSS for k = 1..15 on the scaled data
wcss_vec = count_wcss_scores(df, 15)
x_ticks = [k for k, _ in enumerate(wcss_vec, start=1)]
plt.plot(x_ticks, wcss_vec, marker='x', linestyle='-', color="brown")
plt.title('The Elbow Method showing the optimal k')
plt.xlabel('k')
plt.ylabel('Within-cluster sum of squares')
plt.grid()
plt.show()
from sklearn.metrics import silhouette_score
def count_clustering_scores(X, cluster_num, model, score_fun, random_state=None):
    """Score a clustering model for one or several cluster counts.

    Args:
        X: Data to cluster; passed to ``fit_predict`` and to ``score_fun``.
        cluster_num: Either a single integer (Python or numpy) or an
            iterable of integers with the cluster counts to evaluate.
        model: Clustering class that is instantiated as
            ``model(n_clusters=k, random_state=...)`` and provides
            ``fit_predict``.
        score_fun: Callable ``score_fun(X, labels)`` returning the score.
        random_state: Seed forwarded to ``model``. When omitted, the
            module-level ``seed`` is used, as before.

    Returns:
        A single score when ``cluster_num`` is an integer, otherwise a list
        of scores in the order of ``cluster_num``.
    """
    if random_state is None:
        # Fall back to the notebook-wide seed to keep the old behaviour
        random_state = seed

    # numpy integers (e.g. np.int64) are not instances of int, so accept
    # both; otherwise a numpy scalar would be treated as an iterable and fail
    single_value = isinstance(cluster_num, (int, np.integer))
    cluster_num_iter = [cluster_num] if single_value else cluster_num

    scores = []
    for k in cluster_num_iter:
        model_instance = model(n_clusters=k, random_state=random_state)
        labels = model_instance.fit_predict(X)
        scores.append(score_fun(X, labels))

    return scores[0] if single_value else scores
# Silhouette score of KMeans for k = 2..19
cluster_num_seq = range(2, 20)
silhouette_vec = count_clustering_scores(
    df,
    cluster_num_seq,
    KMeans,
    silhouette_score,
)
plt.plot(cluster_num_seq, silhouette_vec, marker='x', linestyle='-', color="brown")
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.grid()
plt.show()
from sklearn.decomposition import PCA
import matplotlib.cm as cm

# Project the data with scikit-learn's PCA. The estimator is not called
# `pca` because the later `from pca import pca` would overwrite that name.
sk_pca = PCA()
result = sk_pca.fit_transform(df)

# Cluster in the original feature space; the same count drives the palette
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
labels = kmeans.fit_predict(df)

# Show the clusters on the first two principal components
sns.scatterplot(
    x=result[:, 0],
    y=result[:, 1],
    hue=labels,
    palette=sns.color_palette("hls", n_clusters),
)
plt.show()
from pca import pca
# Build a model from the `pca` package that keeps 7 principal components
# NOTE(review): this import rebinds the name `pca` to the package's class
pca_2 = pca(n_components=7)
# Fit the model on df and transform it
results = pca_2.fit_transform(df)
# Plot explained variance
fig, ax = pca_2.plot()
# Scatter plot of the principal components (PC1 vs PC2 per the log output)
fig, ax = pca_2.scatter()
# Biplot with loadings; n_feat presumably limits how many features are drawn
fig, ax = pca_2.biplot(n_feat=7)
[pca] >Extracting column labels from dataframe. [pca] >Extracting row labels from dataframe. [pca] >The PCA reduction is performed on the [81] columns of the input dataframe. [pca] >Fit using PCA. [pca] >Compute loadings and PCs. [pca] >Compute explained variance. [pca] >Outlier detection using Hotelling T2 test with alpha=[0.05] and n_components=[7] [pca] >Multiple test correction applied for Hotelling T2 test: [fdr_bh] [pca] >Outlier detection using SPE/DmodX with n_std=[3]
[scatterd] >INFO> Create scatterplot [scatterd] >INFO> Create scatterplot
[scatterd]> WARNING use the standardized verbose status. The status [1-6] will be deprecated in future versions. [pca] >Plot PC1 vs PC2 with loadings. [scatterd]> WARNING use the standardized verbose status. The status [1-6] will be deprecated in future versions.